1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
18 The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
86 and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking

# Default repository location used when no explicit path/BUP_DIR is given.
home_repodir = os.path.expanduser('~/.bup')

# Map git object type names to the numeric codes used in pack files,
# and the reverse mapping for decoding.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for git/repository errors raised by this module.
class GitError(Exception):
    """Get the path to the git repository or one of its subdirectories."""
    # repodir must have been set (by check_repo_or_die) before any lookup.
    raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):

    return os.path.join(repodir, sub)
    # Normalize both the candidate path and the repo root so the string
    # prefix comparison below is reliable.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    # If the path is inside the repo, report it relative to the repo root,
    # additionally hiding the 'index-cache/' prefix.
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    # The repo's own pack dir plus any per-remote index-cache directories.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Regenerate the .midx and bloom indexes for objdir, best-effort.

    Runs 'bup midx --auto' and 'bup bloom' as subprocesses; failures are
    recorded via add_error() instead of raised.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A regular file stored as something other than a plain git blob is
    # one of our chunked files.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
    # name[:-1] also catches names that end in '.bup' plus one character.
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)


def demangle_name(name):
    """Undo mangle_name()'s transformation on a file name, if any.

    Returns a tuple (demangled_filename, mode), where mode is one of:

    * BUP_NORMAL  : the file should be read as-is from the repository
    * BUP_CHUNKED : the file was split into chunks and must be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    # '.bupl' must be tested before '.bup' since it is the longer suffix.
    for suffix, mode in (('.bupl', BUP_NORMAL), ('.bup', BUP_CHUNKED)):
        if name.endswith(suffix):
            return (name[:-len(suffix)], mode)
    return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # Pack object header: low 4 bits of the first byte carry size bits,
    # the next 3 bits carry the numeric object type.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # Continuation bit: set while more size bytes follow.
    if sz: szbits |= 0x80
    # Compress at level 1: speed matters more than ratio here.
    z = zlib.compressobj(1)
    yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield zlib-compressed chunks forming a git loose object.

    The output is the standard loose-object layout: a '<type> <size>\\0'
    header followed by the content, deflated as a single zlib stream.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Flush the compressor: without this the deflate stream is truncated
    # and zlib.decompress() on the concatenated output would fail.
    yield z.flush()
def _decode_looseobj(buf):
    """Decode a zlib-compressed loose object into (type, content)."""
    s = zlib.decompress(buf)
    # Sanity-check the parsed '<type> <size>\0' header against the payload.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Decode a pack-encoded object into (type, content)."""
    # Bits 4-6 of the first byte carry the numeric object type code.
    type = _typermap[(c & 0x70) >> 4]
    # Size continues 7 bits per byte, least-significant group first.
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        # With want_source, return the owning idx's basename instead of True.
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True

        # fanout[255] is the cumulative count, i.e. the total object count.
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        """Binary-search the sorted sha table; return the entry's index."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # The fanout table bounds the search to shas whose first byte is b1.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v1 layout starts with a 256-entry fanout of big-endian uint32s.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        # Each v1 entry is 24 bytes: 4-byte offset followed by 20-byte sha.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # The pack offset is the leading 4 bytes of the 24-byte entry.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The sha is the trailing 20 bytes of the 24-byte entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # __iter__: yield each sha as a zero-copy buffer slice.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 magic: '\377tOc' followed by version number 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 separates the tables: shas, then crc32s, then 4-byte offsets,
        # then a 64-bit offset table for packs > 2GB.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set means the low 31 bits index the 64-bit offset table.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        # __iter__: yield each sha as a zero-copy buffer slice.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# C-accelerated helpers from the _helpers extension module.
extract_bits = _helpers.extract_bits
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
    """Wrapper which contains data from multiple index files.
    A bloom filter over all object ids, used to answer "definitely not
    present" quickly before consulting the real pack indexes.
    """
    def __init__(self, filename, f=None, readwrite=False, expected=-1):
        assert(filename.endswith('.bloom'))
        self.rwfile = f = f or open(filename, 'r+b')
        # Decide if we want to mmap() the pages as writable ('immediate'
        # write) or else map them privately for later writing back to
        # the file ('delayed' write).  A bloom table's write access
        # pattern is such that we dirty almost all the pages after adding
        # very few entries.  But the table is so big that dirtying
        # *all* the pages often exceeds Linux's default
        # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
        # thus causing it to start flushing the table before we're
        # finished... even though there's more than enough space to
        # store the bloom table in RAM.
        #
        # To work around that behaviour, if we calculate that we'll
        # probably end up touching the whole table anyway (at least
        # one bit flipped per memory page), let's use a "private" mmap,
        # which defeats Linux's ability to flush it to disk.  Then we'll
        # flush it as one big lump during close().
        pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
        self.delaywrite = expected > pages
        debug1('bloom: delaywrite=%r\n' % self.delaywrite)
        self.map = mmap_readwrite_private(self.rwfile, close=False)
        self.map = mmap_readwrite(self.rwfile, close=False)
        # Read-only open path.
        f = f or open(filename, 'rb')
        self.map = mmap_read(f)
        # Validate the 'BLOM' magic and the version field.
        got = str(self.map[0:4])
        log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
        return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
            return self._init_failed()

        # Header: table size (log2 bits), hash count k, entry count.
        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        # The '\0'-separated idx name list follows the 2**bits-byte table.
        idxnamestr = str(self.map[16 + 2**self.bits:])
        self.idxnames = idxnamestr.split('\0')
    def _init_failed(self):
        # Leave the object in a harmless state where valid() is false.
        self.bits = self.entries = 0

        # valid(): nonempty only if the map is open and the header parsed.
        return self.map and self.bits

        # close(): write back the entry count, the (possibly privately
        # mapped) table pages, and the trailing idx name list.
        if self.map and self.rwfile:
            debug2("bloom: closing with %d entries\n" % self.entries)
            self.map[12:16] = struct.pack('!I', self.entries)
            self.rwfile.write(self.map)
            # The name list lives just past the 2**bits-byte filter table.
            self.rwfile.seek(16 + 2**self.bits)
            self.rwfile.write('\0'.join(self.idxnames))
    def pfalse_positive(self, additional=0):
        """Return the estimated false-positive rate (percent) after adding
        'additional' more entries."""
        n = self.entries + additional
        # Standard bloom estimate (1 - e^(-kn/m))^k, scaled to percent.
        return 100*(1-math.exp(-k*float(n)/m))**k

    def add_idx(self, ix):
        """Add the object to the filter, return current pfalse_positive."""
        if not self.map: raise Exception, "Cannot add to closed bloom"
        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
        self.idxnames.append(os.path.basename(ix.name))
    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter.

        A false result is definitive; a true result may be a false positive.
        """
        global _total_searches, _total_steps
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
    def create(cls, name, expected, delaywrite=None, f=None, k=None):
        """Create and return a bloom filter for `expected` entries."""
        # Size the table at MAX_BITS_EACH bits per expected entry, rounded
        # down to a power of two bytes.
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        # Prefer k=5 (smaller, better pfalse) when the table is small
        # enough to be addressed by 160/5 sha bits; otherwise fall back to k=4.
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        # NOTE: On some systems this will not extend+zerofill, but it does on
        # darwin, linux, bsd and solaris.
        f.truncate(16+2**bits)
        if delaywrite != None and not delaywrite:
            # tell it to expect very few objects, forcing a direct mmap
        return cls(name, f=f, readwrite=True, expected=expected)

        # __len__: number of entries added so far.
        return int(self.entries)
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
    """
    def __init__(self, filename):
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Validate the 'MIDX' magic and version; on any mismatch fall back
        # to a harmless empty state via _init_failed().
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
            self.force_keep = True # new stuff is exciting
            return self._init_failed()

        # Layout: fanout table of 2**bits uint32s, then the sorted shas,
        # then a per-object uint32 naming the owning .idx, then idx names.
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.which_ofs = self.sha_ofs + 20*nsha
        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
    def _init_failed(self):
        # Dummy one-entry tables so len()/iteration on a bad midx is harmless.
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)

    def _fanget(self, i):
        # Read one big-endian uint32 out of the fanout table.
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

        # _idx_to_hash: shas are packed contiguously, 20 bytes each.
        return str(self.shatable[i*20:(i+1)*20])
    def _get_idx_i(self, i):
        # Index (into self.idxnames) of the .idx file that owns object i.
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

    def _get_idxname(self, i):
        # Filename of the .idx file that owns object i.
        return self.idxnames[self._get_idx_i(i)]
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        # Interpolation search: the top self.bits of the sha choose a
        # fanout bucket, then the position within [start, end) is
        # estimated linearly from the sha's leading 32 bits.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print '    %08x' % self._num(v)
        startv = _helpers.firstword(v)
        endv = _helpers.firstword(v)
        # With want_source, report which .idx the hit came from.
        return want_source and self._get_idxname(mid) or True
        # __iter__: yield every sha in sorted order, zero-copy.
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shatable, i*20, 20)

        # __len__: the last fanout entry is the total object count.
        return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False

        # __del__: the single-instance accounting must balance.
        assert(_mpi_count == 0)

        # __iter__: merged iteration across all contained packs.
        return iter(idxmerge(self.packs))

        # __len__: total objects over all packs.
        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # Objects explicitly registered via add() short-circuit the search.
        if hash in self.also:
        # Consult the bloom filter first: a miss there is definitive.
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Keep already-loaded packs (except midxes when skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # Register the .idx files each loaded midx already covers.
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                (mxd, mxf) = os.path.split(mx.name)
                # A midx referencing a missing .idx is broken; warn.
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
            # Biggest midx first, so smaller redundant ones get dropped.
            midxl.sort(lambda x,y: -cmp(len(x),len(y)))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
            for name in ix.idxnames:
                d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
            bfull = os.path.join(self.dir, 'bup.bloom')
            # Reopen the bloom filter, if one exists for this pack dir.
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = ShaBloom(bfull)
            # Search biggest packs first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only trust the bloom if it covers at least all our objects.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes '<type> <len>\0' + content with sha1.
    header = '%s %d\0' % (type, len(content))


def _shalist_sort_key(ent):
    # Git sorts tree entries as though directory names had a trailing '/'.
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
def open_idx(filename):
    """Open a .idx or .midx file, returning the appropriate wrapper object.

    Raises GitError for unrecognized extensions, headers, or versions.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # '\377tOc' magic marks a v2+ index; otherwise assume v1.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    # Periodic progress callback during the merge.
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    # Final callback, printed once at the end (unless suppressed).
    def pfinal(count, total):
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default objcache factory for PackWriter: index over the repo's packs.
    return PackIdxList(repo('objects/pack'))
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        self.objcache_maker = objcache_maker

        # _open: lazily create the pack tempfile inside the repo, so the
        # final rename in _end() stays on the same filesystem.
        (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
        self.file = os.fdopen(fd, 'w+b')
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        # Pack header: 'PACK', version 2, object count 0 (patched in _end()).
        self.file.write('PACK\0\0\0\2\0\0\0\0')
        # 256 buckets (by first sha byte) used to build the idx fanout later.
        self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # Re-raise write failures as GitError, keeping the original traceback.
        raise GitError, e, sys.exc_info()[2]
        # crc32 is signed in python2; mask to the unsigned 32-bit value.
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # Bucket by first sha byte; the offset is where this object started.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Build the objcache on first use; without one we can't answer
        # exists() queries, so that's a hard error.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        # Deduplicate: only write objects the cache hasn't seen yet.
        if not self.exists(sha):
            self._write(sha, type, content)
            self.objcache.add(sha)

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Git requires tree entries sorted with directories as 'name/'.
        shalist = sorted(shalist, key = _shalist_sort_key)
        for (mode,name,bin) in shalist:
            # Git never writes a leading zero on the mode.
            assert(mode[0] != '0')
            assert(len(bin) == 20)
            # Entry format: '<mode> <name>\0' followed by the raw 20-byte sha.
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit object line by line; every field is optional
        # so callers can omit e.g. the parent for a root commit.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # Use the current user as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
        """Remove the pack file from disk."""
        os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        # Finalize the pack: patch the object count into the header, append
        # the pack's own sha1, write the idx, and rename both into place.
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        # Final name is derived from the sha over the sorted object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        auto_midx(repo('objects/pack'))
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a v2 pack index for this pack; return the hex sha naming it."""
        idx_f = open(filename, 'w+b')
        # v2 magic + version.
        idx_f.write('\377tOc\0\0\0\2')
        # Fixed-size portion: header + fanout + (sha+crc+ofs) per object;
        # 64-bit offsets, if any, are appended past this point.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        # The C helper fills the tables; it must account for every object.
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        b = idx_f.read(8 + 4*256)
        # The pack's name is the sha1 of its sorted object list.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        # Trailing checksum over the idx contents themselves.
        idx_f.write(idx_sum.digest())
    # _git_date: format as '<epoch> <tz-offset>', the form git expects.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

    # _gitenv (used as a subprocess preexec_fn): point git at our repo.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # Each output line is '<sha> <refname>'.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit_id).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    # Guard against ref names that git would parse as options.
    assert(not ref.startswith('-'))
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # Output alternates 'commit <sha>' lines with '%ct' timestamp lines.
    for row in p.stdout:
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        yield (date, commit)
    rv = p.wait() # not fatal
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    # rev_list with count=1 yields at most one (date, commit) pair.
    for (date, commit) in rev_list(ref, count=1):
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First, try it as a ref name.
    head = read_ref(committish)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    # Otherwise, try it as a full 40-char hex sha present in our packs.
    pL = PackIdxList(repo('objects/pack'))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    # git update-ref checks oldval, making the update atomic w.r.t. races.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then to the default ~/.bup location.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory is missing or the target
    exists but is not a directory; otherwise runs 'git init' and forces
    pack.indexVersion=2 in the new repo's config.
    """
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: was '%d', which raises TypeError when formatting the
        # (string) path instead of producing the intended error message.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # A valid repo must have an objects/pack directory.
    if not os.path.isdir(repo('objects/pack/.')):
        # Only the default home repository is auto-initialized.
        if repodir == home_repodir:
        log('error: %r is not a bup/git repository\n' % repo())
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
    # Entries are '<mode> <name>\0' followed by a raw 20-byte sha.
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        yield (spl[0], spl[1], sha)
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')
    """
    p = subprocess.Popen(['git', '--version'],
                         stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    # Expected output shape: 'git version X.Y...'.
    m = re.match(r'git version (\S+.\S+)', gvs)
    raise GitError('git --version weird output: %r' % gvs)
    _ver = tuple(m.group(1).split('.'))
    # Minimum supported git version.
    needed = ('1','5', '3', '1')
    raise GitError('git version %s or higher is required; you have %s'
                   % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for a git subprocess and raise on any nonzero exit status.
    raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    # Run a git command and return its captured stdout (raises on failure).
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    _git_wait(repr(argv), p)
class _AbortableIter:
    """Iterator wrapper that runs a cleanup callback if iteration is
    abandoned before exhaustion."""
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # next(): pass items through; exhaustion counts as a clean finish.
        return self.it.next()
    except StopIteration, e:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    # 'git cat-file --batch' needs git >= 1.5.6; otherwise fall back to
    # one subprocess per object, which is much slower.
    wanted = ('1','5','6')
    log('warning: git version < %s; bup will be slow.\n'
    self.get = self._slow_get
    self.p = self.inprogress = None
    self.get = self._fast_get

        # _abort: tear down the batch subprocess so _restart can relaunch it.
        self.p.stdout.close()
        self.p.stdin.close()
        self.inprogress = None

        # _restart: (re)spawn the long-lived batch process.
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  preexec_fn = _gitenv)
    def _fast_get(self, id):
        # Relaunch the batch process if it died or was never started.
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        log('_fast_get: opening %r while %r is open'
            % (id, self.inprogress))
        # Only one object may be streamed from the batch pipe at a time.
        assert(not self.inprogress)
        # Reject ids that could corrupt the line-based protocol or be
        # parsed as options.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        # Response header: '<sha> <type> <size>' or '<id> missing'.
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # Stream exactly 'size' bytes; aborting mid-stream kills the pipe.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # The batch protocol terminates each object with a newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
        except Exception, e:
    def _slow_get(self, id):
        # Fallback path for old git: one cat-file subprocess per object.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # First ask for the type, then stream the object body.
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)
    def _join(self, it):
        # Recursively expand an object into the blobs it reaches:
        # blobs yield themselves, trees recurse per entry, commits
        # recurse into their root tree.
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # The first line of a commit object is 'tree <hex-sha>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
        # A StopIteration from the underlying pipe just ends the stream.
        except StopIteration:
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for (n,c) in list_refs():
        if n.startswith('refs/tags/'):
            tags[c].append(name) # more than one tag can point at 'c'