1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# Bloom-filter tuning knobs; see the long discussion above.
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
# Default repository location (used when no BUP_DIR is configured).
home_repodir = os.path.expanduser('~/.bup')
# Git object type name <-> packfile 3-bit type code, both directions.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
110 class GitError(Exception):
115 """Get the path to the git repository or one of its subdirectories."""
118 raise GitError('You should call check_repo_or_die()')
120 # If there's a .git subdirectory, then the actual repo is in there.
121 gd = os.path.join(repodir, '.git')
122 if os.path.exists(gd):
125 return os.path.join(repodir, sub)
129 paths = [repo('objects/pack')]
130 paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Run 'bup midx --auto' and then 'bup bloom' over objdir.

    Best-effort maintenance: failures are recorded via add_error(),
    not raised to the caller.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
    # Second pass: regenerate the bloom filter for the same directory.
    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # Stored as a regular file but represented differently in git:
        # this is a chunked (segmented) file.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # Avoid colliding with the mangled names of chunked files.
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    """Yield the packfile encoding of 'content': a variable-width
    size+type header (git packfile style) followed by the
    zlib-deflated data.
    """
    szout = ''
    sz = len(content)
    # The low 4 size bits share the first byte with the 3-bit type code.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80   # high bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()   # without the flush, the deflate stream is truncated
def _encode_looseobj(type, content):
    """Yield the zlib-deflated loose-object encoding of 'content':
    a '<type> <size>' NUL-terminated header followed by the content.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()   # flush the compressor or the stream is truncated
def _decode_looseobj(buf):
    """Inverse of _encode_looseobj: inflate 'buf', split the
    '<type> <size>\\0<content>' layout and return (type, content)."""
    s = zlib.decompress(buf)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Inverse of _encode_packobj: parse the variable-width size/type
    header, inflate the rest and return (type, content)."""
    # Bits 4-6 of the first byte carry the 3-bit type code.
    type = _typermap[(c & 0x70) >> 4]
    # Continuation bytes each contribute 7 more size bits.
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # _idx_from_hash yields the entry's position within this index.
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index.

        With want_source=True, a hit returns this index's basename.
        """
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
264 return int(self.fanout[255])
    def _idx_from_hash(self, hash):
        """Binary-search this index for the 20-byte 'hash'; return its
        entry position, searching only the matching fanout bucket."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # The first sha byte (b1) selects the fanout bucket boundaries.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        # NOTE(review): self.name is presumably assigned 'filename'
        # just before this — confirm.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # 256 fanout counters, 4 bytes each, network byte order.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        # v1 entries are 24 bytes each: 4-byte offset + 20-byte sha.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # Offset is the first 4 bytes of the 24-byte entry.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The sha occupies bytes 4..24 of the entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # (__iter__ body) Yield each 20-byte sha, in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        # NOTE(review): self.name is presumably assigned 'filename'
        # just before this — confirm.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 header: '\377tOc' magic followed by !I version == 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 stores shas, crcs and offsets in separate parallel tables.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        # 64-bit offset spill table, used for offsets >= 2**31.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set means the low 31 bits index the 64-bit table.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        # shas are packed contiguously, 20 bytes each.
        return str(self.shatable[idx*20:(idx+1)*20])

        # (__iter__ body) Yield each 20-byte sha, in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Bind the C-accelerated helpers (bup._helpers) to module-level names.
extract_bits = _helpers.extract_bits
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
354 """Wrapper which contains data from multiple index files.
    def __init__(self, filename, f=None, readwrite=False, expected=-1):
        """Open the bloom filter stored in 'filename'.

        With readwrite=True the table is mapped for writing, and
        'expected' (the anticipated number of new entries) decides
        between immediate and delayed write-back.
        """
        assert(filename.endswith('.bloom'))
        self.rwfile = f = f or open(filename, 'r+b')
        # Decide if we want to mmap() the pages as writable ('immediate'
        # write) or else map them privately for later writing back to
        # the file ('delayed' write).  A bloom table's write access
        # pattern is such that we dirty almost all the pages after adding
        # very few entries.  But the table is so big that dirtying
        # *all* the pages often exceeds Linux's default
        # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
        # thus causing it to start flushing the table before we're
        # finished... even though there's more than enough space to
        # store the bloom table in RAM.
        #
        # To work around that behaviour, if we calculate that we'll
        # probably end up touching the whole table anyway (at least
        # one bit flipped per memory page), let's use a "private" mmap,
        # which defeats Linux's ability to flush it to disk.  Then we'll
        # flush it as one big lump during close().
        pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
        self.delaywrite = expected > pages
        debug1('bloom: delaywrite=%r\n' % self.delaywrite)
        self.map = mmap_readwrite_private(self.rwfile, close=False)
        self.map = mmap_readwrite(self.rwfile, close=False)
        # Read-only path: plain read mapping.
        f = f or open(filename, 'rb')
        self.map = mmap_read(f)
        # File layout: 'BLOM' magic, !I version, !HH bits/k, !I entries,
        # then 2**bits table bytes, then NUL-separated idx names.
        got = str(self.map[0:4])
        log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
        return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
            return self._init_failed()
        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        idxnamestr = str(self.map[16 + 2**self.bits:])
        self.idxnames = idxnamestr.split('\0')
    def _init_failed(self):
        """Reset state after a failed open; the filter reads as invalid."""
        self.bits = self.entries = 0
424 return self.map and self.bits
430 if self.map and self.rwfile:
431 debug2("bloom: closing with %d entries\n" % self.entries)
432 self.map[12:16] = struct.pack('!I', self.entries)
435 self.rwfile.write(self.map)
438 self.rwfile.seek(16 + 2**self.bits)
440 self.rwfile.write('\0'.join(self.idxnames))
    def pfalse_positive(self, additional=0):
        """Return the expected false-positive rate (as a percentage) if
        'additional' more entries were added to the filter."""
        n = self.entries + additional
        # Standard bloom-filter estimate: (1 - e^(-kn/m))^k.
        return 100*(1-math.exp(-k*float(n)/m))**k
449 def add_idx(self, ix):
450 """Add the object to the filter, return current pfalse_positive."""
451 if not self.map: raise Exception, "Cannot add to closed bloom"
452 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
453 self.idxnames.append(os.path.basename(ix.name))
    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter.

        Bloom filters can yield false positives, never false negatives.
        """
        global _total_searches, _total_steps
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
    def create(cls, name, expected, delaywrite=None, f=None, k=None):
        """Create and return a bloom filter for `expected` entries."""
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        # Prefer k=5 (smaller table, better pfalse) when bits permit.
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        # 16-byte header: !I version, !HH bits/k, !I entry count (0).
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        # NOTE: On some systems this will not extend+zerofill, but it does on
        # darwin, linux, bsd and solaris.
        f.truncate(16+2**bits)
        if delaywrite != None and not delaywrite:
            # tell it to expect very few objects, forcing a direct mmap
            return cls(name, f=f, readwrite=True, expected=expected)
487 return int(self.entries)
491 """Wrapper which contains data from multiple index files.
492 Multiple index (.midx) files constitute a wrapper around index (.idx) files
493 and make it possible for bup to expand Git's indexing capabilities to vast
    def __init__(self, filename):
        """Open and validate the .midx file 'filename'.

        On a bad/old/new header, _init_failed() leaves the object in a
        harmless empty state; force_keep tells refresh() whether the
        file should survive cleanup.
        """
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Header: 'MIDX' magic followed by a !I format version.
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
            self.force_keep = True # new stuff is exciting
            return self._init_failed()
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        # Layout after the 12-byte header: fanout table, sha table,
        # which-idx table, then NUL-separated idx names.
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.which_ofs = self.sha_ofs + 20*nsha
        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
    def _init_failed(self):
        # Install minimal empty tables so later reads don't crash.
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)
    def _fanget(self, i):
        """Return fanout entry i: the count of objects whose leading
        bits are <= i."""
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)
540 return str(self.shatable[i*20:(i+1)*20])
    def _get_idx_i(self, i):
        """Return the position (into self.idxnames) of the .idx file
        that owns midx entry i."""
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
    def _get_idxname(self, i):
        """Return the name of the .idx file that owns midx entry i."""
        return self.idxnames[self._get_idx_i(i)]
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Interpolation search over the sorted sha table; with
        want_source=True a hit returns the owning .idx filename.
        """
        global _total_searches, _total_steps
        # The top self.bits of the sha select the fanout bucket.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        # Interpolate the probe position from the 32-bit prefix values.
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print ' %08x' % self._num(v)
        startv = _helpers.firstword(v)
        endv = _helpers.firstword(v)
        return want_source and self._get_idxname(mid) or True
583 for i in xrange(self._fanget(self.entries-1)):
584 yield buffer(self.shatable, i*20, 20)
587 return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        # Bloom lookups stay disabled until refresh() finds a valid filter.
        self.do_bloom = False
606 assert(_mpi_count == 0)
609 return iter(idxmerge(self.packs))
612 return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Consults the 'also' set and the bloom filter before falling
        back to a linear scan of the pack indexes; a hit moves that
        pack to the front of the search order.
        """
        global _total_searches
        if hash in self.also:
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # Map each idx name covered by an already-loaded midx to it.
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
                # Bigger midxes first: they supersede more idx files.
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for sub in ix.idxnames:
                    found = d.get(os.path.join(self.dir, sub))
                    if not found or isinstance(found, PackIdx):
                        # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
        # Only trust the bloom filter if it covers every known object.
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
714 """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion: sha1 over the
    '<type> <size>' NUL-terminated header followed by the content."""
    header = '%s %d\0' % (type, len(content))
726 def _shalist_sort_key(ent):
727 (mode, name, id) = ent
728 if stat.S_ISDIR(int(mode, 8)):
def open_idx(filename):
    """Open a pack index and return the matching wrapper object:
    PackIdxV1/PackIdxV2 for .idx (dispatching on the file header),
    PackMidx for .midx; raises GitError otherwise."""
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # '\377tOc' magic marks a v2 index; plain v1 has no magic.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList.

    Progress is reported while reading; the final "done" line is only
    printed when final_progress is true.
    """
    def pfunc(count, total):
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Honor the caller's final_progress flag.
        if final_progress:
            log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: an index list over the repo's packs."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        # Factory invoked lazily (see _require_objcache) to build the
        # object-existence cache.
        self.objcache_maker = objcache_maker
785 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
786 self.file = os.fdopen(fd, 'w+b')
787 assert(name.endswith('.pack'))
788 self.filename = name[:-5]
789 self.file.write('PACK\0\0\0\2\0\0\0\0')
790 self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        """Write 'datalist' to the pack file as one chunk and record
        (sha, crc, offset) in the in-memory index."""
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # Re-raise with the original traceback (Python 2 3-arg raise).
        raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Bucket by the sha's first byte, matching the idx fanout layout;
        # the entry's offset is the current position minus its size.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # NOTE(review): presumably only recomputed when the caller passed
        # no sha (an 'if not sha:' guard appears elided here) — confirm.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        self.outbytes = self.count = 0
    def _require_objcache(self):
        """Build self.objcache on first use; error out if impossible."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            "PackWriter not opened or can't check exists w/o objcache")
838 def exists(self, id, want_source=False):
839 """Return non-empty if an object is found in the object cache."""
840 self._require_objcache()
841 return self.objcache.exists(id, want_source=want_source)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # Record it so we don't write a duplicate later.
            self.objcache.add(sha)
852 def new_blob(self, blob):
853 """Create a blob object in the pack with the supplied content."""
854 return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Git requires tree entries sorted as if dirs had a trailing '/'.
        shalist = sorted(shalist, key = _shalist_sort_key)
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')   # git rejects zero-padded modes
            assert(len(bin) == 20)   # binary sha, not hex
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit text line by line; empty fields are skipped.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))
    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The current user acts as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
888 """Remove the pack file from disk."""
894 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        """Finish the pack: patch the object count into the header,
        write the .idx, and rename both files into objects/pack/ under
        their content-derived name; optionally rebuild midx files."""
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        auto_midx(repo('objects/pack'))
931 def close(self, run_midx=True):
932 """Close the pack file and move it to its definitive path."""
933 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a v2 pack index for 'idx' and return the hex sha of
        the sorted object list (used to name the pack)."""
        idx_f = open(filename, 'w+b')
        # v2 magic + version.
        idx_f.write('\377tOc\0\0\0\2')
        # Fixed-size region: header + fanout + (sha+crc+ofs32) per object.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        # The C helper fills the tables directly into the mapping.
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        b = idx_f.read(8 + 4*256)
        # Hash just the sorted sha list to derive the pack's name.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
969 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
973 os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    # Each output line is '<hex sha> <refname>'.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))   # refs must not look like options
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # Output alternates 'commit <hex>' lines with '%ct' date lines.
    for row in p.stdout:
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        yield (date, commit)
    rv = p.wait()  # not fatal
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    # count=1 limits the walk to the tip commit.
    for (date, commit) in rev_list(ref, count=1):
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # Try it as a ref name first.
    head = read_ref(committish)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    pL = PackIdxList(repo('objects/pack'))
    # A 40-char string may be a full hex sha; check the pack indexes.
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then to ~/.bup.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory is missing or the target
    exists and is not a directory.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Fixed: was '%d', which raises TypeError when formatting a path.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # objects/pack/. exists in any usable (bare) git repository.
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            log('error: %r is not a bup/git repository\n' % repo())
1128 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1130 while ofs < len(buf):
1131 z = buf[ofs:].find('\0')
1133 spl = buf[ofs:ofs+z].split(' ', 1)
1134 assert(len(spl) == 2)
1135 sha = buf[ofs+z+1:ofs+z+1+20]
1137 yield (spl[0], spl[1], sha)
1142 """Get Git's version and ensure a usable version is installed.
1144 The returned version is formatted as an ordered tuple with each position
1145 representing a digit in the version tag. For example, the following tuple
1146 would represent version 1.6.6.9:
1148 ('1', '6', '6', '9')
1152 p = subprocess.Popen(['git', '--version'],
1153 stdout=subprocess.PIPE)
1154 gvs = p.stdout.read()
1155 _git_wait('git --version', p)
1156 m = re.match(r'git version (\S+.\S+)', gvs)
1158 raise GitError('git --version weird output: %r' % gvs)
1159 _ver = tuple(m.group(1).split('.'))
1160 needed = ('1','5', '3', '1')
1162 raise GitError('git version %s or higher is required; you have %s'
1163 % ('.'.join(needed), '.'.join(_ver)))
1167 def _git_wait(cmd, p):
1170 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    """Run 'argv' (with the bup git environment) and return its stdout."""
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    _git_wait(repr(argv), p)
class _AbortableIter:
    """Iterator wrapper that can be aborted mid-stream, running an
    optional 'onabort' callback (used to reset 'git cat-file')."""
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # (next() body) Delegate to the wrapped iterator.
        return self.it.next()
        except StopIteration, e:

        """Abort iteration and call the abortion callback, if needed."""
1212 """Link to 'git cat-file' that is used to retrieve blob data."""
1215 wanted = ('1','5','6')
1218 log('warning: git version < %s; bup will be slow.\n'
1221 self.get = self._slow_get
1223 self.p = self.inprogress = None
1224 self.get = self._fast_get
1228 self.p.stdout.close()
1229 self.p.stdin.close()
1231 self.inprogress = None
1235 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1236 stdin=subprocess.PIPE,
1237 stdout=subprocess.PIPE,
1240 preexec_fn = _gitenv)
    def _fast_get(self, id):
        """Fetch object 'id' through the persistent 'git cat-file --batch'
        child; yields the object type, then the content in chunks."""
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        log('_fast_get: opening %r while %r is open'
            % (id, self.inprogress))
        assert(not self.inprogress)
        # ids must be single-line and must not look like options.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        # Batch header: '<hex sha> <type> <size>' or '<id> missing'.
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # Batch output is followed by a single terminating newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
        except Exception, e:
    def _slow_get(self, id):
        """Fetch object 'id' by spawning one 'git cat-file' per request
        (fallback for old git); yields type, then content chunks."""
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # First ask for the type, then stream the object itself.
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)
    def _join(self, it):
        """Yield the data of every blob reachable from the object stream
        'it': blobs yield directly, trees/commits recurse via join()."""
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # The first line of a commit is 'tree <hex sha>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1312 """Generate a list of the content of all blobs that can be reached
1313 from an object. The hash given in 'id' must point to a blob, a tree
1314 or a commit. The content of all blobs that can be seen from trees or
1315 commits will be added to the list.
1318 for d in self._join(self.get(id)):
1320 except StopIteration:
1324 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1326 for (n,c) in list_refs():
1327 if n.startswith('refs/tags/'):
1332 tags[c].append(name) # more than one tag can point at 'c'