1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
94 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
95 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
96 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
100 home_repodir = os.path.expanduser('~/.bup')
103 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
104 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
class GitError(Exception):
    """Base exception for all errors raised by this module."""
    pass
def repo(sub = ''):
    """Get the path to the git repository or one of its subdirectories."""
    global repodir
    if not repodir:
        raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):
        repodir = gd

    return os.path.join(repodir, sub)
128 def auto_midx(objdir):
129 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
131 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
133 # make sure 'args' gets printed to help with debugging
134 add_error('%r: exception: %s' % (args, e))
137 add_error('%r: returned %d' % (args, rv))
139 args = [path.exe(), 'bloom', '--dir', objdir]
141 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
143 # make sure 'args' gets printed to help with debugging
144 add_error('%r: exception: %s' % (args, e))
147 add_error('%r: returned %d' % (args, rv))
150 def mangle_name(name, mode, gitmode):
151 """Mangle a file name to present an abstract name for segmented files.
152 Mangled file names will have the ".bup" extension added to them. If a
153 file's name already ends with ".bup", a ".bupl" extension is added to
154 disambiguate normal files from semgmented ones.
156 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
158 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
159 return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename,mode), where mode is one of
    the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        # Unmangled: an ordinary file stored under its own name.
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    """Yield the packfile encoding of one object: a variable-length
    size/type header followed by the zlib-compressed content.
    """
    szout = ''
    sz = len(content)
    # Low 4 bits of the size share the first byte with the 3-bit type code.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80   # continuation bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()
def _encode_looseobj(type, content):
    """Yield the loose-object encoding: zlib('<type> <size>\\0' + content)."""
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()   # without this, the zlib stream is truncated
def _decode_looseobj(buf):
    """Decode a loose object; return (type, content).

    Inverse of _encode_looseobj().
    """
    assert(buf)
    s = zlib.decompress(buf)
    i = s.find('\0')
    assert(i > 0)
    l = s[:i].split(' ')
    assert(len(l) == 2)
    type = l[0]
    sz = int(l[1])
    content = s[i+1:]
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Decode a packfile-encoded object; return (type, content).

    Inverse of _encode_packobj().
    """
    assert(buf)
    c = ord(buf[0])
    type = _typermap[(c & 0x70) >> 4]
    sz = c & 0x0f
    shift = 4
    i = 0
    # Continue while the continuation bit (0x80) is set.
    while c & 0x80:
        i += 1
        c = ord(buf[i])
        sz |= (c & 0x7f) << shift
        shift += 7
    return (type, zlib.decompress(buf[i+1:]))
244 def find_offset(self, hash):
245 """Get the offset of an object inside the index file."""
246 idx = self._idx_from_hash(hash)
248 return self._ofs_from_idx(idx)
251 def exists(self, hash, want_source=False):
252 """Return nonempty if the object exists in this index."""
253 if hash and (self._idx_from_hash(hash) != None):
254 return want_source and os.path.basename(self.name) or True
258 return int(self.fanout[255])
260 def _idx_from_hash(self, hash):
261 global _total_searches, _total_steps
263 assert(len(hash) == 20)
265 start = self.fanout[b1-1] # range -1..254
266 end = self.fanout[b1] # range 0..255
268 _total_steps += 1 # lookup table is a step
271 mid = start + (end-start)/2
272 v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file.

    v1 layout: 256*4-byte fanout table, then nsha records of
    (4-byte offset, 20-byte sha).
    """
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # First 4 bytes of each 24-byte record is the pack offset.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # Remaining 20 bytes of the record is the sha.
        return str(self.shatable[idx*24+4 : idx*24+24])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file.

    v2 layout: 8-byte magic+version, fanout table, sha table, crc table,
    32-bit offset table, then a 64-bit offset table for large packs.
    """
    def __init__(self, filename, f):
        self.name = filename
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
                               nsha*4)
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        if ofs & 0x80000000:
            # High bit set: value is an index into the 64-bit offset table.
            idx64 = ofs & 0x7fffffff
            ofs = struct.unpack('!Q',
                                str(buffer(self.ofs64table, idx64*8, 8)))[0]
        return ofs

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

    def __iter__(self):
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
341 extract_bits = _helpers.extract_bits
343 bloom_contains = _helpers.bloom_contains
344 bloom_add = _helpers.bloom_add
348 """Wrapper which contains data from multiple index files.
350 def __init__(self, filename, f=None, readwrite=False, expected=-1):
354 assert(filename.endswith('.bloom'))
357 self.rwfile = f = f or open(filename, 'r+b')
360 # Decide if we want to mmap() the pages as writable ('immediate'
361 # write) or else map them privately for later writing back to
362 # the file ('delayed' write). A bloom table's write access
363 # pattern is such that we dirty almost all the pages after adding
364 # very few entries. But the table is so big that dirtying
365 # *all* the pages often exceeds Linux's default
366 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
367 # thus causing it to start flushing the table before we're
368 # finished... even though there's more than enough space to
369 # store the bloom table in RAM.
371 # To work around that behaviour, if we calculate that we'll
372 # probably end up touching the whole table anyway (at least
373 # one bit flipped per memory page), let's use a "private" mmap,
374 # which defeats Linux's ability to flush it to disk. Then we'll
375 # flush it as one big lump during close().
376 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
377 self.delaywrite = expected > pages
378 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
380 self.map = mmap_readwrite_private(self.rwfile, close=False)
382 self.map = mmap_readwrite(self.rwfile, close=False)
385 f = f or open(filename, 'rb')
386 self.map = mmap_read(f)
387 got = str(self.map[0:4])
389 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
390 return self._init_failed()
391 ver = struct.unpack('!I', self.map[4:8])[0]
392 if ver < BLOOM_VERSION:
393 log('Warning: ignoring old-style (v%d) bloom %r\n'
395 return self._init_failed()
396 if ver > BLOOM_VERSION:
397 log('Warning: ignoring too-new (v%d) bloom %r\n'
399 return self._init_failed()
401 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
402 idxnamestr = str(self.map[16 + 2**self.bits:])
404 self.idxnames = idxnamestr.split('\0')
408 def _init_failed(self):
415 self.bits = self.entries = 0
418 return self.map and self.bits
424 if self.map and self.rwfile:
425 debug2("bloom: closing with %d entries\n" % self.entries)
426 self.map[12:16] = struct.pack('!I', self.entries)
429 self.rwfile.write(self.map)
432 self.rwfile.seek(16 + 2**self.bits)
434 self.rwfile.write('\0'.join(self.idxnames))
437 def pfalse_positive(self, additional=0):
438 n = self.entries + additional
441 return 100*(1-math.exp(-k*float(n)/m))**k
    def add_idx(self, ix):
        """Add all of the objects in idx file 'ix' to the filter.

        Updates self.entries; does not return a value.
        """
        if not self.map: raise Exception, "Cannot add to closed bloom"
        # bloom_add (C helper) sets k bits per sha and returns how many
        # entries it inserted.
        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
        self.idxnames.append(os.path.basename(ix.name))
449 def exists(self, sha):
450 """Return nonempty if the object probably exists in the bloom filter."""
451 global _total_searches, _total_steps
453 if not self.map: return None
454 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
455 _total_steps += steps
459 def create(cls, name, expected, delaywrite=None, f=None, k=None):
460 """Create and return a bloom filter for `expected` entries."""
461 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
462 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
463 if bits > MAX_BLOOM_BITS[k]:
464 log('bloom: warning, max bits exceeded, non-optimal\n')
465 bits = MAX_BLOOM_BITS[k]
466 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
467 f = f or open(name, 'w+b')
469 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
470 assert(f.tell() == 16)
471 # NOTE: On some systems this will not extend+zerofill, but it does on
472 # darwin, linux, bsd and solaris.
473 f.truncate(16+2**bits)
475 if delaywrite != None and not delaywrite:
476 # tell it to expect very few objects, forcing a direct mmap
478 return cls(name, f=f, readwrite=True, expected=expected)
481 return int(self.entries)
485 """Wrapper which contains data from multiple index files.
486 Multiple index (.midx) files constitute a wrapper around index (.idx) files
487 and make it possible for bup to expand Git's indexing capabilities to vast
490 def __init__(self, filename):
492 self.force_keep = False
493 assert(filename.endswith('.midx'))
494 self.map = mmap_read(open(filename))
495 if str(self.map[0:4]) != 'MIDX':
496 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
497 self.force_keep = True
498 return self._init_failed()
499 ver = struct.unpack('!I', self.map[4:8])[0]
500 if ver < MIDX_VERSION:
501 log('Warning: ignoring old-style (v%d) midx %r\n'
503 self.force_keep = False # old stuff is boring
504 return self._init_failed()
505 if ver > MIDX_VERSION:
506 log('Warning: ignoring too-new (v%d) midx %r\n'
508 self.force_keep = True # new stuff is exciting
509 return self._init_failed()
511 self.bits = _helpers.firstword(self.map[8:12])
512 self.entries = 2**self.bits
513 self.fanout = buffer(self.map, 12, self.entries*4)
514 self.sha_ofs = 12 + self.entries*4
515 self.nsha = nsha = self._fanget(self.entries-1)
516 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
517 self.which_ofs = self.sha_ofs + 20*nsha
518 self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
519 self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
521 def _init_failed(self):
524 self.fanout = buffer('\0\0\0\0')
525 self.shatable = buffer('\0'*20)
528 def _fanget(self, i):
530 s = self.fanout[start:start+4]
531 return _helpers.firstword(s)
534 return str(self.shatable[i*20:(i+1)*20])
    def _get_idx_i(self, i):
        # Which .idx file (index into self.idxnames) contains object i.
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
    def _get_idxname(self, i):
        # Name of the .idx file that contains object i.
        return self.idxnames[self._get_idx_i(i)]
542 def exists(self, hash, want_source=False):
543 """Return nonempty if the object exists in the index files."""
544 global _total_searches, _total_steps
547 el = extract_bits(want, self.bits)
549 start = self._fanget(el-1)
550 startv = el << (32-self.bits)
554 end = self._fanget(el)
555 endv = (el+1) << (32-self.bits)
556 _total_steps += 1 # lookup table is a step
557 hashv = _helpers.firstword(hash)
558 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
561 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
562 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
563 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
565 #print ' %08x' % self._num(v)
568 startv = _helpers.firstword(v)
571 endv = _helpers.firstword(v)
573 return want_source and self._get_idxname(mid) or True
577 for i in xrange(self._fanget(self.entries-1)):
578 yield buffer(self.shatable, i*20, 20)
581 return int(self._fanget(self.entries-1))
586 def __init__(self, dir):
588 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
593 self.do_bloom = False
600 assert(_mpi_count == 0)
603 return iter(idxmerge(self.packs))
606 return sum(len(pack) for pack in self.packs)
608 def exists(self, hash, want_source=False):
609 """Return nonempty if the object exists in the index files."""
610 global _total_searches
612 if hash in self.also:
614 if self.do_bloom and self.bloom is not None:
615 _total_searches -= 1 # will be incremented by bloom
616 if self.bloom.exists(hash):
617 self.do_bloom = False
620 for i in xrange(len(self.packs)):
622 _total_searches -= 1 # will be incremented by sub-pack
623 ix = p.exists(hash, want_source=want_source)
625 # reorder so most recently used packs are searched first
626 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
631 def refresh(self, skip_midx = False):
632 """Refresh the index list.
633 This method verifies if .midx files were superseded (e.g. all of its
634 contents are in another, bigger .midx file) and removes the superseded
637 If skip_midx is True, all work on .midx files will be skipped and .midx
638 files will be removed from the list.
640 The module-global variable 'ignore_midx' can force this function to
641 always act as if skip_midx was True.
643 self.bloom = None # Always reopen the bloom as it may have been relaced
644 self.do_bloom = False
645 skip_midx = skip_midx or ignore_midx
646 d = dict((p.name, p) for p in self.packs
647 if not skip_midx or not isinstance(p, PackMidx))
648 if os.path.exists(self.dir):
651 for ix in self.packs:
652 if isinstance(ix, PackMidx):
653 for name in ix.idxnames:
654 d[os.path.join(self.dir, name)] = ix
655 for full in glob.glob(os.path.join(self.dir,'*.midx')):
658 (mxd, mxf) = os.path.split(mx.name)
660 for n in mx.idxnames:
661 if not os.path.exists(os.path.join(mxd, n)):
662 log(('warning: index %s missing\n' +
663 ' used by %s\n') % (n, mxf))
670 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
673 for sub in ix.idxnames:
674 found = d.get(os.path.join(self.dir, sub))
675 if not found or isinstance(found, PackIdx):
676 # doesn't exist, or exists but not in a midx
681 for name in ix.idxnames:
682 d[os.path.join(self.dir, name)] = ix
683 elif not ix.force_keep:
684 debug1('midx: removing redundant: %s\n'
685 % os.path.basename(ix.name))
687 for full in glob.glob(os.path.join(self.dir,'*.idx')):
695 bfull = os.path.join(self.dir, 'bup.bloom')
696 if self.bloom is None and os.path.exists(bfull):
697 self.bloom = ShaBloom(bfull)
698 self.packs = list(set(d.values()))
699 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
700 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
704 debug1('PackIdxList: using %d index%s.\n'
705 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
708 """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion:
    sha1('<type> <len>\\0' + content).
    """
    header = '%s %d\0' % (type, len(content))
    sum = Sha1()
    sum.update(header)
    sum.update(content)
    return sum.digest()
720 def _shalist_sort_key(ent):
721 (mode, name, id) = ent
722 if stat.S_ISDIR(int(mode, 8)):
def open_idx(filename):
    """Open a .idx or .midx file; return the matching index object.

    Raises GitError for unknown extensions, unrecognized headers, or idx
    versions other than 1 and 2.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        header = f.read(8)
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            if version == 2:
                return PackIdxV2(filename, f)
            else:
                raise GitError('%s: expected idx file version 2, got %d'
                               % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            # v1 files have no magic; they start directly with the fanout.
            return PackIdxV1(filename, f)
        else:
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
    else:
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    def pfunc(count, total):
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    def pfinal(count, total):
        if final_progress:
            log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default objcache factory for PackWriter: index every pack in the
    # local repository's objects/pack directory.
    return PackIdxList(repo('objects/pack'))
    """Writes Git objects inside a pack file."""
765 def __init__(self, objcache_maker=_make_objcache):
771 self.objcache_maker = objcache_maker
779 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
780 self.file = os.fdopen(fd, 'w+b')
781 assert(name.endswith('.pack'))
782 self.filename = name[:-5]
783 self.file.write('PACK\0\0\0\2\0\0\0\0')
784 self.idx = list(list() for i in xrange(256))
786 def _raw_write(self, datalist, sha):
789 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
790 # the file never has a *partial* blob. So let's make sure it's
791 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
792 # to our hashsplit algorithm.) f.write() does its own buffering,
793 # but that's okay because we'll flush it in _end().
794 oneblob = ''.join(datalist)
798 raise GitError, e, sys.exc_info()[2]
800 crc = zlib.crc32(oneblob) & 0xffffffff
801 self._update_idx(sha, crc, nw)
806 def _update_idx(self, sha, crc, size):
809 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
811 def _write(self, sha, type, content):
815 sha = calc_hash(type, content)
816 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
819 def breakpoint(self):
820 """Clear byte and object counts and return the last processed id."""
822 self.outbytes = self.count = 0
825 def _require_objcache(self):
826 if self.objcache is None and self.objcache_maker:
827 self.objcache = self.objcache_maker()
828 if self.objcache is None:
830 "PackWriter not opened or can't check exists w/o objcache")
    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache.

        With want_source, the cache returns which index holds it.
        """
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)
837 def maybe_write(self, type, content):
838 """Write an object to the pack file if not present and return its id."""
839 self._require_objcache()
840 sha = calc_hash(type, content)
841 if not self.exists(sha):
842 self._write(sha, type, content)
843 self.objcache.add(sha)
    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content.

        Deduplicated via maybe_write(); returns the blob's binary sha.
        """
        return self.maybe_write('blob', blob)
850 def new_tree(self, shalist):
851 """Create a tree object in the pack."""
852 shalist = sorted(shalist, key = _shalist_sort_key)
854 for (mode,name,bin) in shalist:
857 assert(mode[0] != '0')
859 assert(len(bin) == 20)
860 l.append('%s %s\0%s' % (mode,name,bin))
861 return self.maybe_write('tree', ''.join(l))
863 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
865 if tree: l.append('tree %s' % tree.encode('hex'))
866 if parent: l.append('parent %s' % parent.encode('hex'))
867 if author: l.append('author %s %s' % (author, _git_date(adate)))
868 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
871 return self.maybe_write('commit', '\n'.join(l))
873 def new_commit(self, parent, tree, date, msg):
874 """Create a commit object in the pack."""
875 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
876 commit = self._new_commit(tree, parent,
877 userline, date, userline, date,
882 """Remove the pack file from disk."""
888 os.unlink(self.filename + '.pack')
890 def _end(self, run_midx=True):
892 if not f: return None
898 # update object count
900 cp = struct.pack('!i', self.count)
904 # calculate the pack sha1sum
907 for b in chunkyreader(f):
909 packbin = sum.digest()
913 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
915 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
916 if os.path.exists(self.filename + '.map'):
917 os.unlink(self.filename + '.map')
918 os.rename(self.filename + '.pack', nameprefix + '.pack')
919 os.rename(self.filename + '.idx', nameprefix + '.idx')
922 auto_midx(repo('objects/pack'))
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
929 def _write_pack_idx_v2(self, filename, idx, packbin):
930 idx_f = open(filename, 'w+b')
931 idx_f.write('\377tOc\0\0\0\2')
933 ofs64_ofs = 8 + 4*256 + 28*self.count
934 idx_f.truncate(ofs64_ofs)
936 idx_map = mmap_readwrite(idx_f, close=False)
937 idx_f.seek(0, SEEK_END)
938 count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
939 assert(count == self.count)
945 b = idx_f.read(8 + 4*256)
948 obj_list_sum = Sha1()
949 for b in chunkyreader(idx_f, 20*self.count):
951 obj_list_sum.update(b)
952 namebase = obj_list_sum.hexdigest()
954 for b in chunkyreader(idx_f):
956 idx_f.write(idx_sum.digest())
963 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
def _gitenv():
    # preexec_fn for subprocesses: point git at the bup repository.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    if refname:
        argv += [refname]
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal: nonzero just means no matching refs
    if rv:
        assert(not out)
    if out:
        for d in out.split('\n'):
            (sha, name) = d.split(' ', 1)
            yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
    l = map(lambda x: x[1], l)
    if not l:
        return None
    else:
        assert(len(l) == 1)
        return l[0]
998 def rev_list(ref, count=None):
999 """Generate a list of reachable commits in reverse chronological order.
1001 This generator walks through commits, from child to parent, that are
1002 reachable via the specified ref and yields a series of tuples of the form
1005 If count is a non-zero integer, limit the number of commits to "count"
1008 assert(not ref.startswith('-'))
1011 opts += ['-n', str(atoi(count))]
1012 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1013 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1015 for row in p.stdout:
1017 if s.startswith('commit '):
1018 commit = s[7:].decode('hex')
1021 yield (date, commit)
1022 rv = p.wait() # not fatal
1024 raise GitError, 'git rev-list returned error %d' % rv
1027 def rev_get_date(ref):
1028 """Get the date of the latest commit on the specified ref."""
1029 for (date, commit) in rev_list(ref, count=1):
1031 raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    head = read_ref(committish)
    if head:
        debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
        return head

    pL = PackIdxList(repo('objects/pack'))

    if len(committish) == 40:
        try:
            hash = committish.decode('hex')
        except TypeError:
            return None

        if pL.exists(hash):
            return hash

    return None
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch.

    oldval is passed to 'git update-ref' so the update fails if someone
    else moved the ref in the meantime.
    """
    if not oldval:
        oldval = ''
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    global repodir
    if path:
        repodir = path
    if not repodir:
        # Fall back to $BUP_DIR, then the default ~/.bup.
        repodir = os.environ.get('BUP_DIR')
        if not repodir:
            repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path."""
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: was '"%d"', which raises TypeError when formatting the
        # path string; use %s.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    guess_repo(path)
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            # Default repo: create it on demand instead of failing.
            init_repo()
        else:
            log('error: %r is not a bup/git repository\n' % repo())
            sys.exit(15)
def treeparse(buf):
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'.

    'buf' is the raw content of a git tree object: repeated records of
    '<mode> <name>\\0<20-byte binary sha>'.
    """
    ofs = 0
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        assert(z > 0)
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        ofs += z+1+20
        yield (spl[0], spl[1], sha)
def ver():
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')
    """
    global _ver
    if not _ver:
        p = subprocess.Popen(['git', '--version'],
                             stdout=subprocess.PIPE)
        gvs = p.stdout.read()
        _git_wait('git --version', p)
        m = re.match(r'git version (\S+.\S+)', gvs)
        if not m:
            raise GitError('git --version weird output: %r' % gvs)
        _ver = tuple(m.group(1).split('.'))
    needed = ('1','5', '3', '1')
    if _ver < needed:
        raise GitError('git version %s or higher is required; you have %s'
                       % ('.'.join(needed), '.'.join(_ver)))
    return _ver
1161 def _git_wait(cmd, p):
1164 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    # Run a git command inside the repo and return its entire stdout.
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    r = p.stdout.read()
    _git_wait(repr(argv), p)
    return r
1174 class _AbortableIter:
1175 def __init__(self, it, onabort = None):
1177 self.onabort = onabort
1185 return self.it.next()
1186 except StopIteration, e:
1194 """Abort iteration and call the abortion callback, if needed."""
1206 """Link to 'git cat-file' that is used to retrieve blob data."""
1209 wanted = ('1','5','6')
1212 log('warning: git version < %s; bup will be slow.\n'
1215 self.get = self._slow_get
1217 self.p = self.inprogress = None
1218 self.get = self._fast_get
1222 self.p.stdout.close()
1223 self.p.stdin.close()
1225 self.inprogress = None
1229 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1230 stdin=subprocess.PIPE,
1231 stdout=subprocess.PIPE,
1234 preexec_fn = _gitenv)
1236 def _fast_get(self, id):
1237 if not self.p or self.p.poll() != None:
1240 assert(self.p.poll() == None)
1242 log('_fast_get: opening %r while %r is open'
1243 % (id, self.inprogress))
1244 assert(not self.inprogress)
1245 assert(id.find('\n') < 0)
1246 assert(id.find('\r') < 0)
1247 assert(not id.startswith('-'))
1248 self.inprogress = id
1249 self.p.stdin.write('%s\n' % id)
1250 self.p.stdin.flush()
1251 hdr = self.p.stdout.readline()
1252 if hdr.endswith(' missing\n'):
1253 self.inprogress = None
1254 raise KeyError('blob %r is missing' % id)
1255 spl = hdr.split(' ')
1256 if len(spl) != 3 or len(spl[0]) != 40:
1257 raise GitError('expected blob, got %r' % spl)
1258 (hex, type, size) = spl
1260 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1261 onabort = self._abort)
1266 assert(self.p.stdout.readline() == '\n')
1267 self.inprogress = None
1268 except Exception, e:
1272 def _slow_get(self, id):
1273 assert(id.find('\n') < 0)
1274 assert(id.find('\r') < 0)
1275 assert(id[0] != '-')
1276 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1279 p = subprocess.Popen(['git', 'cat-file', type, id],
1280 stdout=subprocess.PIPE,
1281 preexec_fn = _gitenv)
1282 for blob in chunkyreader(p.stdout):
1284 _git_wait('git cat-file', p)
1286 def _join(self, it):
1291 elif type == 'tree':
1292 treefile = ''.join(it)
1293 for (mode, name, sha) in treeparse(treefile):
1294 for blob in self.join(sha.encode('hex')):
1296 elif type == 'commit':
1297 treeline = ''.join(it).split('\n')[0]
1298 assert(treeline.startswith('tree '))
1299 for blob in self.join(treeline[5:]):
1302 raise GitError('invalid object type %r: expected blob/tree/commit'
1306 """Generate a list of the content of all blobs that can be reached
1307 from an object. The hash given in 'id' must point to a blob, a tree
1308 or a commit. The content of all blobs that can be seen from trees or
1309 commits will be added to the list.
1312 for d in self._join(self.get(id)):
1314 except StopIteration:
def tags():
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    tags = {}
    for (n,c) in list_refs():
        if n.startswith('refs/tags/'):
            name = n[10:]   # strip 'refs/tags/'
            if not c in tags:
                tags[c] = []
            tags[c].append(name)  # more than one tag can point at 'c'
    return tags