1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
18 The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
86 and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
94 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
95 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
96 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
100 home_repodir = os.path.expanduser('~/.bup')
103 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
104 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
110 class GitError(Exception):
115 """Get the path to the git repository or one of its subdirectories."""
118 raise GitError('You should call check_repo_or_die()')
120 # If there's a .git subdirectory, then the actual repo is in there.
121 gd = os.path.join(repodir, '.git')
122 if os.path.exists(gd):
125 return os.path.join(repodir, sub)
129 full = os.path.abspath(path)
130 fullrepo = os.path.abspath(repo(''))
131 if not fullrepo.endswith('/'):
133 if full.startswith(fullrepo):
134 path = full[len(fullrepo):]
135 if path.startswith('index-cache/'):
136 path = path[len('index-cache/'):]
141 paths = [repo('objects/pack')]
142 paths += glob.glob(repo('index-cache/*/.'))
146 def auto_midx(objdir):
147 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
149 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
151 # make sure 'args' gets printed to help with debugging
152 add_error('%r: exception: %s' % (args, e))
155 add_error('%r: returned %d' % (args, rv))
157 args = [path.exe(), 'bloom', '--dir', objdir]
159 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
161 # make sure 'args' gets printed to help with debugging
162 add_error('%r: exception: %s' % (args, e))
165 add_error('%r: returned %d' % (args, rv))
168 def mangle_name(name, mode, gitmode):
169 """Mangle a file name to present an abstract name for segmented files.
170 Mangled file names will have the ".bup" extension added to them. If a
171 file's name already ends with ".bup", a ".bupl" extension is added to
172 disambiguate normal files from semgmented ones.
174 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
176 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
177 return name + '.bupl'
182 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
183 def demangle_name(name):
184 """Remove name mangling from a file name, if necessary.
186 The return value is a tuple (demangled_filename,mode), where mode is one of
189 * BUP_NORMAL : files that should be read as-is from the repository
190 * BUP_CHUNKED : files that were chunked and need to be assembled
192 For more information on the name mangling algorythm, see mangle_name()
194 if name.endswith('.bupl'):
195 return (name[:-5], BUP_NORMAL)
196 elif name.endswith('.bup'):
197 return (name[:-4], BUP_CHUNKED)
199 return (name, BUP_NORMAL)
202 def _encode_packobj(type, content):
205 szbits = (sz & 0x0f) | (_typemap[type]<<4)
208 if sz: szbits |= 0x80
214 z = zlib.compressobj(1)
216 yield z.compress(content)
220 def _encode_looseobj(type, content):
221 z = zlib.compressobj(1)
222 yield z.compress('%s %d\0' % (type, len(content)))
223 yield z.compress(content)
227 def _decode_looseobj(buf):
229 s = zlib.decompress(buf)
236 assert(type in _typemap)
237 assert(sz == len(content))
238 return (type, content)
241 def _decode_packobj(buf):
244 type = _typermap[(c & 0x70) >> 4]
251 sz |= (c & 0x7f) << shift
255 return (type, zlib.decompress(buf[i+1:]))
262 def find_offset(self, hash):
263 """Get the offset of an object inside the index file."""
264 idx = self._idx_from_hash(hash)
266 return self._ofs_from_idx(idx)
269 def exists(self, hash, want_source=False):
270 """Return nonempty if the object exists in this index."""
271 if hash and (self._idx_from_hash(hash) != None):
272 return want_source and os.path.basename(self.name) or True
276 return int(self.fanout[255])
278 def _idx_from_hash(self, hash):
279 global _total_searches, _total_steps
281 assert(len(hash) == 20)
283 start = self.fanout[b1-1] # range -1..254
284 end = self.fanout[b1] # range 0..255
286 _total_steps += 1 # lookup table is a step
289 mid = start + (end-start)/2
290 v = self._idx_to_hash(mid)
300 class PackIdxV1(PackIdx):
301 """Object representation of a Git pack index (version 1) file."""
302 def __init__(self, filename, f):
304 self.idxnames = [self.name]
305 self.map = mmap_read(f)
306 self.fanout = list(struct.unpack('!256I',
307 str(buffer(self.map, 0, 256*4))))
308 self.fanout.append(0) # entry "-1"
309 nsha = self.fanout[255]
311 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
313 def _ofs_from_idx(self, idx):
314 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
316 def _idx_to_hash(self, idx):
317 return str(self.shatable[idx*24+4 : idx*24+24])
320 for i in xrange(self.fanout[255]):
321 yield buffer(self.map, 256*4 + 24*i + 4, 20)
324 class PackIdxV2(PackIdx):
325 """Object representation of a Git pack index (version 2) file."""
326 def __init__(self, filename, f):
328 self.idxnames = [self.name]
329 self.map = mmap_read(f)
330 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
331 self.fanout = list(struct.unpack('!256I',
332 str(buffer(self.map, 8, 256*4))))
333 self.fanout.append(0) # entry "-1"
334 nsha = self.fanout[255]
335 self.sha_ofs = 8 + 256*4
336 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
337 self.ofstable = buffer(self.map,
338 self.sha_ofs + nsha*20 + nsha*4,
340 self.ofs64table = buffer(self.map,
341 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
343 def _ofs_from_idx(self, idx):
344 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
346 idx64 = ofs & 0x7fffffff
347 ofs = struct.unpack('!Q',
348 str(buffer(self.ofs64table, idx64*8, 8)))[0]
351 def _idx_to_hash(self, idx):
352 return str(self.shatable[idx*20:(idx+1)*20])
355 for i in xrange(self.fanout[255]):
356 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
359 extract_bits = _helpers.extract_bits
361 bloom_contains = _helpers.bloom_contains
362 bloom_add = _helpers.bloom_add
366 """Wrapper which contains data from multiple index files.
368 def __init__(self, filename, f=None, readwrite=False, expected=-1):
372 assert(filename.endswith('.bloom'))
375 self.rwfile = f = f or open(filename, 'r+b')
378 # Decide if we want to mmap() the pages as writable ('immediate'
379 # write) or else map them privately for later writing back to
380 # the file ('delayed' write). A bloom table's write access
381 # pattern is such that we dirty almost all the pages after adding
382 # very few entries. But the table is so big that dirtying
383 # *all* the pages often exceeds Linux's default
384 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
385 # thus causing it to start flushing the table before we're
386 # finished... even though there's more than enough space to
387 # store the bloom table in RAM.
389 # To work around that behaviour, if we calculate that we'll
390 # probably end up touching the whole table anyway (at least
391 # one bit flipped per memory page), let's use a "private" mmap,
392 # which defeats Linux's ability to flush it to disk. Then we'll
393 # flush it as one big lump during close().
394 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
395 self.delaywrite = expected > pages
396 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
398 self.map = mmap_readwrite_private(self.rwfile, close=False)
400 self.map = mmap_readwrite(self.rwfile, close=False)
403 f = f or open(filename, 'rb')
404 self.map = mmap_read(f)
405 got = str(self.map[0:4])
407 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
408 return self._init_failed()
409 ver = struct.unpack('!I', self.map[4:8])[0]
410 if ver < BLOOM_VERSION:
411 log('Warning: ignoring old-style (v%d) bloom %r\n'
413 return self._init_failed()
414 if ver > BLOOM_VERSION:
415 log('Warning: ignoring too-new (v%d) bloom %r\n'
417 return self._init_failed()
419 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
420 idxnamestr = str(self.map[16 + 2**self.bits:])
422 self.idxnames = idxnamestr.split('\0')
426 def _init_failed(self):
433 self.bits = self.entries = 0
436 return self.map and self.bits
442 if self.map and self.rwfile:
443 debug2("bloom: closing with %d entries\n" % self.entries)
444 self.map[12:16] = struct.pack('!I', self.entries)
447 self.rwfile.write(self.map)
450 self.rwfile.seek(16 + 2**self.bits)
452 self.rwfile.write('\0'.join(self.idxnames))
455 def pfalse_positive(self, additional=0):
456 n = self.entries + additional
459 return 100*(1-math.exp(-k*float(n)/m))**k
461 def add_idx(self, ix):
462 """Add the object to the filter, return current pfalse_positive."""
463 if not self.map: raise Exception, "Cannot add to closed bloom"
464 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
465 self.idxnames.append(os.path.basename(ix.name))
467 def exists(self, sha):
468 """Return nonempty if the object probably exists in the bloom filter."""
469 global _total_searches, _total_steps
471 if not self.map: return None
472 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
473 _total_steps += steps
477 def create(cls, name, expected, delaywrite=None, f=None, k=None):
478 """Create and return a bloom filter for `expected` entries."""
479 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
480 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
481 if bits > MAX_BLOOM_BITS[k]:
482 log('bloom: warning, max bits exceeded, non-optimal\n')
483 bits = MAX_BLOOM_BITS[k]
484 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
485 f = f or open(name, 'w+b')
487 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
488 assert(f.tell() == 16)
489 # NOTE: On some systems this will not extend+zerofill, but it does on
490 # darwin, linux, bsd and solaris.
491 f.truncate(16+2**bits)
493 if delaywrite != None and not delaywrite:
494 # tell it to expect very few objects, forcing a direct mmap
496 return cls(name, f=f, readwrite=True, expected=expected)
499 return int(self.entries)
503 """Wrapper which contains data from multiple index files.
504 Multiple index (.midx) files constitute a wrapper around index (.idx) files
505 and make it possible for bup to expand Git's indexing capabilities to vast
508 def __init__(self, filename):
510 self.force_keep = False
511 assert(filename.endswith('.midx'))
512 self.map = mmap_read(open(filename))
513 if str(self.map[0:4]) != 'MIDX':
514 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
515 self.force_keep = True
516 return self._init_failed()
517 ver = struct.unpack('!I', self.map[4:8])[0]
518 if ver < MIDX_VERSION:
519 log('Warning: ignoring old-style (v%d) midx %r\n'
521 self.force_keep = False # old stuff is boring
522 return self._init_failed()
523 if ver > MIDX_VERSION:
524 log('Warning: ignoring too-new (v%d) midx %r\n'
526 self.force_keep = True # new stuff is exciting
527 return self._init_failed()
529 self.bits = _helpers.firstword(self.map[8:12])
530 self.entries = 2**self.bits
531 self.fanout = buffer(self.map, 12, self.entries*4)
532 self.sha_ofs = 12 + self.entries*4
533 self.nsha = nsha = self._fanget(self.entries-1)
534 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
535 self.which_ofs = self.sha_ofs + 20*nsha
536 self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
537 self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
539 def _init_failed(self):
542 self.fanout = buffer('\0\0\0\0')
543 self.shatable = buffer('\0'*20)
546 def _fanget(self, i):
548 s = self.fanout[start:start+4]
549 return _helpers.firstword(s)
552 return str(self.shatable[i*20:(i+1)*20])
554 def _get_idx_i(self, i):
555 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
557 def _get_idxname(self, i):
558 return self.idxnames[self._get_idx_i(i)]
560 def exists(self, hash, want_source=False):
561 """Return nonempty if the object exists in the index files."""
562 global _total_searches, _total_steps
565 el = extract_bits(want, self.bits)
567 start = self._fanget(el-1)
568 startv = el << (32-self.bits)
572 end = self._fanget(el)
573 endv = (el+1) << (32-self.bits)
574 _total_steps += 1 # lookup table is a step
575 hashv = _helpers.firstword(hash)
576 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
579 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
580 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
581 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
583 #print ' %08x' % self._num(v)
586 startv = _helpers.firstword(v)
589 endv = _helpers.firstword(v)
591 return want_source and self._get_idxname(mid) or True
595 for i in xrange(self._fanget(self.entries-1)):
596 yield buffer(self.shatable, i*20, 20)
599 return int(self._fanget(self.entries-1))
604 def __init__(self, dir):
606 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
611 self.do_bloom = False
618 assert(_mpi_count == 0)
621 return iter(idxmerge(self.packs))
624 return sum(len(pack) for pack in self.packs)
626 def exists(self, hash, want_source=False):
627 """Return nonempty if the object exists in the index files."""
628 global _total_searches
630 if hash in self.also:
632 if self.do_bloom and self.bloom is not None:
633 _total_searches -= 1 # will be incremented by bloom
634 if self.bloom.exists(hash):
635 self.do_bloom = False
638 for i in xrange(len(self.packs)):
640 _total_searches -= 1 # will be incremented by sub-pack
641 ix = p.exists(hash, want_source=want_source)
643 # reorder so most recently used packs are searched first
644 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
649 def refresh(self, skip_midx = False):
650 """Refresh the index list.
651 This method verifies if .midx files were superseded (e.g. all of its
652 contents are in another, bigger .midx file) and removes the superseded
655 If skip_midx is True, all work on .midx files will be skipped and .midx
656 files will be removed from the list.
658 The module-global variable 'ignore_midx' can force this function to
659 always act as if skip_midx was True.
661        self.bloom = None # Always reopen the bloom as it may have been replaced
662 self.do_bloom = False
663 skip_midx = skip_midx or ignore_midx
664 d = dict((p.name, p) for p in self.packs
665 if not skip_midx or not isinstance(p, PackMidx))
666 if os.path.exists(self.dir):
669 for ix in self.packs:
670 if isinstance(ix, PackMidx):
671 for name in ix.idxnames:
672 d[os.path.join(self.dir, name)] = ix
673 for full in glob.glob(os.path.join(self.dir,'*.midx')):
676 (mxd, mxf) = os.path.split(mx.name)
678 for n in mx.idxnames:
679 if not os.path.exists(os.path.join(mxd, n)):
680 log(('warning: index %s missing\n' +
681 ' used by %s\n') % (n, mxf))
688 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
691 for sub in ix.idxnames:
692 found = d.get(os.path.join(self.dir, sub))
693 if not found or isinstance(found, PackIdx):
694 # doesn't exist, or exists but not in a midx
699 for name in ix.idxnames:
700 d[os.path.join(self.dir, name)] = ix
701 elif not ix.force_keep:
702 debug1('midx: removing redundant: %s\n'
703 % os.path.basename(ix.name))
705 for full in glob.glob(os.path.join(self.dir,'*.idx')):
713 bfull = os.path.join(self.dir, 'bup.bloom')
714 if self.bloom is None and os.path.exists(bfull):
715 self.bloom = ShaBloom(bfull)
716 self.packs = list(set(d.values()))
717 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
718 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
722 debug1('PackIdxList: using %d index%s.\n'
723 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
726 """Insert an additional object in the list."""
730 def calc_hash(type, content):
731 """Calculate some content's hash in the Git fashion."""
732 header = '%s %d\0' % (type, len(content))
738 def _shalist_sort_key(ent):
739 (mode, name, id) = ent
740 if stat.S_ISDIR(int(mode, 8)):
746 def open_idx(filename):
747 if filename.endswith('.idx'):
748 f = open(filename, 'rb')
750 if header[0:4] == '\377tOc':
751 version = struct.unpack('!I', header[4:8])[0]
753 return PackIdxV2(filename, f)
755 raise GitError('%s: expected idx file version 2, got %d'
756 % (filename, version))
757 elif len(header) == 8 and header[0:4] < '\377tOc':
758 return PackIdxV1(filename, f)
760 raise GitError('%s: unrecognized idx file header' % filename)
761 elif filename.endswith('.midx'):
762 return PackMidx(filename)
764 raise GitError('idx filenames must end with .idx or .midx')
767 def idxmerge(idxlist, final_progress=True):
768 """Generate a list of all the objects reachable in a PackIdxList."""
769 def pfunc(count, total):
770 progress('Reading indexes: %.2f%% (%d/%d)\r'
771 % (count*100.0/total, count, total))
772 def pfinal(count, total):
774 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
775 return merge_iter(idxlist, 10024, pfunc, pfinal)
778 def _make_objcache():
779 return PackIdxList(repo('objects/pack'))
782    """Writes Git objects inside a pack file."""
783 def __init__(self, objcache_maker=_make_objcache):
789 self.objcache_maker = objcache_maker
797 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
798 self.file = os.fdopen(fd, 'w+b')
799 assert(name.endswith('.pack'))
800 self.filename = name[:-5]
801 self.file.write('PACK\0\0\0\2\0\0\0\0')
802 self.idx = list(list() for i in xrange(256))
804 def _raw_write(self, datalist, sha):
807 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
808 # the file never has a *partial* blob. So let's make sure it's
809 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
810 # to our hashsplit algorithm.) f.write() does its own buffering,
811 # but that's okay because we'll flush it in _end().
812 oneblob = ''.join(datalist)
816 raise GitError, e, sys.exc_info()[2]
818 crc = zlib.crc32(oneblob) & 0xffffffff
819 self._update_idx(sha, crc, nw)
824 def _update_idx(self, sha, crc, size):
827 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
829 def _write(self, sha, type, content):
833 sha = calc_hash(type, content)
834 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
837 def breakpoint(self):
838 """Clear byte and object counts and return the last processed id."""
840 self.outbytes = self.count = 0
843 def _require_objcache(self):
844 if self.objcache is None and self.objcache_maker:
845 self.objcache = self.objcache_maker()
846 if self.objcache is None:
848 "PackWriter not opened or can't check exists w/o objcache")
850 def exists(self, id, want_source=False):
851 """Return non-empty if an object is found in the object cache."""
852 self._require_objcache()
853 return self.objcache.exists(id, want_source=want_source)
855 def maybe_write(self, type, content):
856 """Write an object to the pack file if not present and return its id."""
857 self._require_objcache()
858 sha = calc_hash(type, content)
859 if not self.exists(sha):
860 self._write(sha, type, content)
861 self.objcache.add(sha)
864 def new_blob(self, blob):
865 """Create a blob object in the pack with the supplied content."""
866 return self.maybe_write('blob', blob)
868 def new_tree(self, shalist):
869 """Create a tree object in the pack."""
870 shalist = sorted(shalist, key = _shalist_sort_key)
872 for (mode,name,bin) in shalist:
875 assert(mode[0] != '0')
877 assert(len(bin) == 20)
878 l.append('%s %s\0%s' % (mode,name,bin))
879 return self.maybe_write('tree', ''.join(l))
881 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
883 if tree: l.append('tree %s' % tree.encode('hex'))
884 if parent: l.append('parent %s' % parent.encode('hex'))
885 if author: l.append('author %s %s' % (author, _git_date(adate)))
886 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
889 return self.maybe_write('commit', '\n'.join(l))
891 def new_commit(self, parent, tree, date, msg):
892 """Create a commit object in the pack."""
893 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
894 commit = self._new_commit(tree, parent,
895 userline, date, userline, date,
900 """Remove the pack file from disk."""
906 os.unlink(self.filename + '.pack')
908 def _end(self, run_midx=True):
910 if not f: return None
916 # update object count
918 cp = struct.pack('!i', self.count)
922 # calculate the pack sha1sum
925 for b in chunkyreader(f):
927 packbin = sum.digest()
931 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
933 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
934 if os.path.exists(self.filename + '.map'):
935 os.unlink(self.filename + '.map')
936 os.rename(self.filename + '.pack', nameprefix + '.pack')
937 os.rename(self.filename + '.idx', nameprefix + '.idx')
940 auto_midx(repo('objects/pack'))
943 def close(self, run_midx=True):
944 """Close the pack file and move it to its definitive path."""
945 return self._end(run_midx=run_midx)
947 def _write_pack_idx_v2(self, filename, idx, packbin):
948 idx_f = open(filename, 'w+b')
949 idx_f.write('\377tOc\0\0\0\2')
951 ofs64_ofs = 8 + 4*256 + 28*self.count
952 idx_f.truncate(ofs64_ofs)
954 idx_map = mmap_readwrite(idx_f, close=False)
955 idx_f.seek(0, SEEK_END)
956 count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
957 assert(count == self.count)
963 b = idx_f.read(8 + 4*256)
966 obj_list_sum = Sha1()
967 for b in chunkyreader(idx_f, 20*self.count):
969 obj_list_sum.update(b)
970 namebase = obj_list_sum.hexdigest()
972 for b in chunkyreader(idx_f):
974 idx_f.write(idx_sum.digest())
981 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
985 os.environ['GIT_DIR'] = os.path.abspath(repo())
988 def list_refs(refname = None):
989 """Generate a list of tuples in the form (refname,hash).
990 If a ref name is specified, list only this particular ref.
992 argv = ['git', 'show-ref', '--']
995 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
996 out = p.stdout.read().strip()
997 rv = p.wait() # not fatal
1001 for d in out.split('\n'):
1002 (sha, name) = d.split(' ', 1)
1003 yield (name, sha.decode('hex'))
1006 def read_ref(refname):
1007 """Get the commit id of the most recent commit made on a given ref."""
1008 l = list(list_refs(refname))
1016 def rev_list(ref, count=None):
1017 """Generate a list of reachable commits in reverse chronological order.
1019 This generator walks through commits, from child to parent, that are
1020 reachable via the specified ref and yields a series of tuples of the form
1023 If count is a non-zero integer, limit the number of commits to "count"
1026 assert(not ref.startswith('-'))
1029 opts += ['-n', str(atoi(count))]
1030 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1031 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1033 for row in p.stdout:
1035 if s.startswith('commit '):
1036 commit = s[7:].decode('hex')
1039 yield (date, commit)
1040 rv = p.wait() # not fatal
1042 raise GitError, 'git rev-list returned error %d' % rv
1045 def rev_get_date(ref):
1046 """Get the date of the latest commit on the specified ref."""
1047 for (date, commit) in rev_list(ref, count=1):
1049 raise GitError, 'no such commit %r' % ref
1052 def rev_parse(committish):
1053 """Resolve the full hash for 'committish', if it exists.
1055 Should be roughly equivalent to 'git rev-parse'.
1057 Returns the hex value of the hash if it is found, None if 'committish' does
1058 not correspond to anything.
1060 head = read_ref(committish)
1062 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1065 pL = PackIdxList(repo('objects/pack'))
1067 if len(committish) == 40:
1069 hash = committish.decode('hex')
1079 def update_ref(refname, newval, oldval):
1080 """Change the commit pointed to by a branch."""
1083 assert(refname.startswith('refs/heads/'))
1084 p = subprocess.Popen(['git', 'update-ref', refname,
1085 newval.encode('hex'), oldval.encode('hex')],
1086 preexec_fn = _gitenv)
1087 _git_wait('git update-ref', p)
1090 def guess_repo(path=None):
1091 """Set the path value in the global variable "repodir".
1092 This makes bup look for an existing bup repository, but not fail if a
1093 repository doesn't exist. Usually, if you are interacting with a bup
1094 repository, you would not be calling this function but using
1095 check_repo_or_die().
1101 repodir = os.environ.get('BUP_DIR')
1103 repodir = os.path.expanduser('~/.bup')
1106 def init_repo(path=None):
1107 """Create the Git bare repository for bup in a given path."""
1109 d = repo() # appends a / to the path
1110 parent = os.path.dirname(os.path.dirname(d))
1111 if parent and not os.path.exists(parent):
1112 raise GitError('parent directory "%s" does not exist\n' % parent)
1113 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1114 raise GitError('"%d" exists but is not a directory\n' % d)
1115 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1116 preexec_fn = _gitenv)
1117 _git_wait('git init', p)
1118 # Force the index version configuration in order to ensure bup works
1119 # regardless of the version of the installed Git binary.
1120 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1121 stdout=sys.stderr, preexec_fn = _gitenv)
1122 _git_wait('git config', p)
1125 def check_repo_or_die(path=None):
1126 """Make sure a bup repository exists, and abort if not.
1127 If the path to a particular repository was not specified, this function
1128 initializes the default repository automatically.
1131 if not os.path.isdir(repo('objects/pack/.')):
1132 if repodir == home_repodir:
1135 log('error: %r is not a bup/git repository\n' % repo())
1140 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1142 while ofs < len(buf):
1143 z = buf[ofs:].find('\0')
1145 spl = buf[ofs:ofs+z].split(' ', 1)
1146 assert(len(spl) == 2)
1147 sha = buf[ofs+z+1:ofs+z+1+20]
1149 yield (spl[0], spl[1], sha)
1154 """Get Git's version and ensure a usable version is installed.
1156 The returned version is formatted as an ordered tuple with each position
1157 representing a digit in the version tag. For example, the following tuple
1158 would represent version 1.6.6.9:
1160 ('1', '6', '6', '9')
1164 p = subprocess.Popen(['git', '--version'],
1165 stdout=subprocess.PIPE)
1166 gvs = p.stdout.read()
1167 _git_wait('git --version', p)
1168 m = re.match(r'git version (\S+.\S+)', gvs)
1170 raise GitError('git --version weird output: %r' % gvs)
1171 _ver = tuple(m.group(1).split('.'))
1172 needed = ('1','5', '3', '1')
1174 raise GitError('git version %s or higher is required; you have %s'
1175 % ('.'.join(needed), '.'.join(_ver)))
1179 def _git_wait(cmd, p):
1182 raise GitError('%s returned %d' % (cmd, rv))
1185 def _git_capture(argv):
1186 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1188 _git_wait(repr(argv), p)
1192 class _AbortableIter:
1193 def __init__(self, it, onabort = None):
1195 self.onabort = onabort
1203 return self.it.next()
1204 except StopIteration, e:
1212 """Abort iteration and call the abortion callback, if needed."""
1224 """Link to 'git cat-file' that is used to retrieve blob data."""
1227 wanted = ('1','5','6')
1230 log('warning: git version < %s; bup will be slow.\n'
1233 self.get = self._slow_get
1235 self.p = self.inprogress = None
1236 self.get = self._fast_get
1240 self.p.stdout.close()
1241 self.p.stdin.close()
1243 self.inprogress = None
1247 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1248 stdin=subprocess.PIPE,
1249 stdout=subprocess.PIPE,
1252 preexec_fn = _gitenv)
1254 def _fast_get(self, id):
1255 if not self.p or self.p.poll() != None:
1258 assert(self.p.poll() == None)
1260 log('_fast_get: opening %r while %r is open'
1261 % (id, self.inprogress))
1262 assert(not self.inprogress)
1263 assert(id.find('\n') < 0)
1264 assert(id.find('\r') < 0)
1265 assert(not id.startswith('-'))
1266 self.inprogress = id
1267 self.p.stdin.write('%s\n' % id)
1268 self.p.stdin.flush()
1269 hdr = self.p.stdout.readline()
1270 if hdr.endswith(' missing\n'):
1271 self.inprogress = None
1272 raise KeyError('blob %r is missing' % id)
1273 spl = hdr.split(' ')
1274 if len(spl) != 3 or len(spl[0]) != 40:
1275 raise GitError('expected blob, got %r' % spl)
1276 (hex, type, size) = spl
1278 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1279 onabort = self._abort)
1284 assert(self.p.stdout.readline() == '\n')
1285 self.inprogress = None
1286 except Exception, e:
1290 def _slow_get(self, id):
1291 assert(id.find('\n') < 0)
1292 assert(id.find('\r') < 0)
1293 assert(id[0] != '-')
1294 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1297 p = subprocess.Popen(['git', 'cat-file', type, id],
1298 stdout=subprocess.PIPE,
1299 preexec_fn = _gitenv)
1300 for blob in chunkyreader(p.stdout):
1302 _git_wait('git cat-file', p)
1304 def _join(self, it):
1309 elif type == 'tree':
1310 treefile = ''.join(it)
1311 for (mode, name, sha) in treeparse(treefile):
1312 for blob in self.join(sha.encode('hex')):
1314 elif type == 'commit':
1315 treeline = ''.join(it).split('\n')[0]
1316 assert(treeline.startswith('tree '))
1317 for blob in self.join(treeline[5:]):
1320 raise GitError('invalid object type %r: expected blob/tree/commit'
1324 """Generate a list of the content of all blobs that can be reached
1325 from an object. The hash given in 'id' must point to a blob, a tree
1326 or a commit. The content of all blobs that can be seen from trees or
1327 commits will be added to the list.
1330 for d in self._join(self.get(id)):
1332 except StopIteration:
1336 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1338 for (n,c) in list_refs():
1339 if n.startswith('refs/tags/'):
1344 tags[c].append(name) # more than one tag can point at 'c'