1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
18 The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
86 and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking

# Default repository location used when no explicit path/BUP_DIR is given.
home_repodir = os.path.expanduser('~/.bup')

# Map git object type names to the numeric codes used in pack files,
# and the reverse mapping for decoding.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
# Base exception for git/repository errors raised by this module.
class GitError(Exception):
    """Get the path to the git repository or one of its subdirectories."""
    # repodir must have been set (by check_repo_or_die) before any lookup.
    raise GitError('You should call check_repo_or_die()')

    # If there's a .git subdirectory, then the actual repo is in there.
    gd = os.path.join(repodir, '.git')
    if os.path.exists(gd):

    return os.path.join(repodir, sub)
    # Normalize both the candidate path and the repo root so the string
    # prefix comparison below is reliable.
    full = os.path.abspath(path)
    fullrepo = os.path.abspath(repo(''))
    if not fullrepo.endswith('/'):
    # If the path is inside the repo, report it relative to the repo root,
    # additionally hiding the 'index-cache/' prefix.
    if full.startswith(fullrepo):
        path = full[len(fullrepo):]
    if path.startswith('index-cache/'):
        path = path[len('index-cache/'):]
    # The repo's own pack dir plus any per-remote index-cache directories.
    paths = [repo('objects/pack')]
    paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Regenerate the .midx and bloom indexes for objdir, best-effort.

    Runs 'bup midx --auto' and 'bup bloom' as subprocesses; failures are
    recorded via add_error() instead of raised.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))

    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    # A regular file stored as something other than a plain git blob is
    # one of our chunked files.
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
    # name[:-1] also catches names that end in '.bup' plus one character.
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        return name + '.bupl'
(BUP_NORMAL, BUP_CHUNKED) = (0,1)


def demangle_name(name):
    """Undo mangle_name()'s transformation on a file name, if any.

    Returns a tuple (demangled_filename, mode), where mode is one of:

    * BUP_NORMAL  : the file should be read as-is from the repository
    * BUP_CHUNKED : the file was split into chunks and must be reassembled

    For more information on the name mangling algorithm, see mangle_name().
    """
    # '.bupl' must be tested before '.bup' since it is the longer suffix.
    for suffix, mode in (('.bupl', BUP_NORMAL), ('.bup', BUP_CHUNKED)):
        if name.endswith(suffix):
            return (name[:-len(suffix)], mode)
    return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    # Pack object header: low 4 bits of the first byte carry size bits,
    # the next 3 bits carry the numeric object type.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    # Continuation bit: set while more size bytes follow.
    if sz: szbits |= 0x80
    # Compress at level 1: speed matters more than ratio here.
    z = zlib.compressobj(1)
    yield z.compress(content)
def _encode_looseobj(type, content):
    """Yield zlib-compressed chunks forming a git loose object.

    The output is the standard loose-object layout: a '<type> <size>\\0'
    header followed by the content, deflated as a single zlib stream.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    # Flush the compressor: without this the deflate stream is truncated
    # and zlib.decompress() on the concatenated output would fail.
    yield z.flush()
def _decode_looseobj(buf):
    """Decode a zlib-compressed loose object into (type, content)."""
    s = zlib.decompress(buf)
    # Sanity-check the parsed '<type> <size>\0' header against the payload.
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Decode a pack-encoded object into (type, content)."""
    # Bits 4-6 of the first byte carry the numeric object type code.
    type = _typermap[(c & 0x70) >> 4]
    # Size continues 7 bits per byte, least-significant group first.
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        return self._ofs_from_idx(idx)

    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index."""
        # With want_source, return the owning idx's basename instead of True.
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True

        # fanout[255] is the cumulative count, i.e. the total object count.
        return int(self.fanout[255])

    def _idx_from_hash(self, hash):
        """Binary-search the sorted sha table; return the entry's index."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # The fanout table bounds the search to shas whose first byte is b1.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v1 layout starts with a 256-entry fanout of big-endian uint32s.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        # Each v1 entry is 24 bytes: 4-byte offset followed by 20-byte sha.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # The pack offset is the leading 4 bytes of the 24-byte entry.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The sha is the trailing 20 bytes of the 24-byte entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # __iter__: yield each sha as a zero-copy buffer slice.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 magic: '\377tOc' followed by version number 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0) # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 separates the tables: shas, then crc32s, then 4-byte offsets,
        # then a 64-bit offset table for packs > 2GB.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set means the low 31 bits index the 64-bit offset table.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        return str(self.shatable[idx*20:(idx+1)*20])

        # __iter__: yield each sha as a zero-copy buffer slice.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# C-accelerated helpers from the _helpers extension module.
extract_bits = _helpers.extract_bits
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
    """Wrapper which contains data from multiple index files.
    A bloom filter over all object ids, used to answer "definitely not
    present" quickly before consulting the real pack indexes.
    """
    def __init__(self, filename, f=None, readwrite=False, expected=-1):
        assert(filename.endswith('.bloom'))
        self.rwfile = f = f or open(filename, 'r+b')
        # Decide if we want to mmap() the pages as writable ('immediate'
        # write) or else map them privately for later writing back to
        # the file ('delayed' write).  A bloom table's write access
        # pattern is such that we dirty almost all the pages after adding
        # very few entries.  But the table is so big that dirtying
        # *all* the pages often exceeds Linux's default
        # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
        # thus causing it to start flushing the table before we're
        # finished... even though there's more than enough space to
        # store the bloom table in RAM.
        #
        # To work around that behaviour, if we calculate that we'll
        # probably end up touching the whole table anyway (at least
        # one bit flipped per memory page), let's use a "private" mmap,
        # which defeats Linux's ability to flush it to disk.  Then we'll
        # flush it as one big lump during close().
        pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
        self.delaywrite = expected > pages
        debug1('bloom: delaywrite=%r\n' % self.delaywrite)
        self.map = mmap_readwrite_private(self.rwfile, close=False)
        self.map = mmap_readwrite(self.rwfile, close=False)
        # Read-only open path.
        f = f or open(filename, 'rb')
        self.map = mmap_read(f)
        # Validate the 'BLOM' magic and the version field.
        got = str(self.map[0:4])
        log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
        return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
            return self._init_failed()

        # Header: table size (log2 bits), hash count k, entry count.
        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        # The '\0'-separated idx name list follows the 2**bits-byte table.
        idxnamestr = str(self.map[16 + 2**self.bits:])
        self.idxnames = idxnamestr.split('\0')
    def _init_failed(self):
        # Leave the object in a harmless state where valid() is false.
        self.bits = self.entries = 0

        # valid(): nonempty only if the map is open and the header parsed.
        return self.map and self.bits

        # close(): write back the entry count, the (possibly privately
        # mapped) table pages, and the trailing idx name list.
        if self.map and self.rwfile:
            debug2("bloom: closing with %d entries\n" % self.entries)
            self.map[12:16] = struct.pack('!I', self.entries)
            self.rwfile.write(self.map)
            # The name list lives just past the 2**bits-byte filter table.
            self.rwfile.seek(16 + 2**self.bits)
            self.rwfile.write('\0'.join(self.idxnames))
    def pfalse_positive(self, additional=0):
        """Return the estimated false-positive rate (percent) after adding
        'additional' more entries."""
        n = self.entries + additional
        # Standard bloom estimate (1 - e^(-kn/m))^k, scaled to percent.
        return 100*(1-math.exp(-k*float(n)/m))**k

    def add_idx(self, ix):
        """Add the object to the filter, return current pfalse_positive."""
        if not self.map: raise Exception, "Cannot add to closed bloom"
        self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
        self.idxnames.append(os.path.basename(ix.name))
    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter.

        A false result is definitive; a true result may be a false positive.
        """
        global _total_searches, _total_steps
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
    def create(cls, name, expected, delaywrite=None, f=None, k=None):
        """Create and return a bloom filter for `expected` entries."""
        # Size the table at MAX_BITS_EACH bits per expected entry, rounded
        # down to a power of two bytes.
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        # Prefer k=5 (smaller, better pfalse) when the table is small
        # enough to be addressed by 160/5 sha bits; otherwise fall back to k=4.
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        # NOTE: On some systems this will not extend+zerofill, but it does on
        # darwin, linux, bsd and solaris.
        f.truncate(16+2**bits)
        if delaywrite != None and not delaywrite:
            # tell it to expect very few objects, forcing a direct mmap
        return cls(name, f=f, readwrite=True, expected=expected)

        # __len__: number of entries added so far.
        return int(self.entries)
    """Wrapper which contains data from multiple index files.
    Multiple index (.midx) files constitute a wrapper around index (.idx) files
    and make it possible for bup to expand Git's indexing capabilities to vast
    amounts of files.
    """
    def __init__(self, filename):
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Validate the 'MIDX' magic and version; on any mismatch fall back
        # to a harmless empty state via _init_failed().
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
            self.force_keep = True # new stuff is exciting
            return self._init_failed()

        # Layout: fanout table of 2**bits uint32s, then the sorted shas,
        # then a per-object uint32 naming the owning .idx, then idx names.
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.which_ofs = self.sha_ofs + 20*nsha
        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
    def _init_failed(self):
        # Dummy one-entry tables so len()/iteration on a bad midx is harmless.
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)

    def _fanget(self, i):
        # Read one big-endian uint32 out of the fanout table.
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)

        # _idx_to_hash: shas are packed contiguously, 20 bytes each.
        return str(self.shatable[i*20:(i+1)*20])
    def _get_idx_i(self, i):
        # Index (into self.idxnames) of the .idx file that owns object i.
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

    def _get_idxname(self, i):
        # Filename of the .idx file that owns object i.
        return self.idxnames[self._get_idx_i(i)]
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches, _total_steps
        # Interpolation search: the top self.bits of the sha choose a
        # fanout bucket, then the position within [start, end) is
        # estimated linearly from the sha's leading 32 bits.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print '  %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print '    %08x' % self._num(v)
        startv = _helpers.firstword(v)
        endv = _helpers.firstword(v)
        # With want_source, report which .idx the hit came from.
        return want_source and self._get_idxname(mid) or True
        # __iter__: yield every sha in sorted order, zero-copy.
        for i in xrange(self._fanget(self.entries-1)):
            yield buffer(self.shatable, i*20, 20)

        # __len__: the last fanout entry is the total object count.
        return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        self.do_bloom = False

        # __del__: the single-instance accounting must balance.
        assert(_mpi_count == 0)

        # __iter__: merged iteration across all contained packs.
        return iter(idxmerge(self.packs))

        # __len__: total objects over all packs.
        return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files."""
        global _total_searches
        # Objects explicitly registered via add() short-circuit the search.
        if hash in self.also:
        # Consult the bloom filter first: a miss there is definitive.
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of their
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        # Keep already-loaded packs (except midxes when skipping them).
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # Register the .idx files each loaded midx already covers.
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                (mxd, mxf) = os.path.split(mx.name)
                # A midx referencing a missing .idx is broken; warn.
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
            # Biggest midx first, so smaller redundant ones get dropped.
            midxl.sort(lambda x,y: -cmp(len(x),len(y)))
            for sub in ix.idxnames:
                found = d.get(os.path.join(self.dir, sub))
                if not found or isinstance(found, PackIdx):
                    # doesn't exist, or exists but not in a midx
            for name in ix.idxnames:
                d[os.path.join(self.dir, name)] = ix
            elif not ix.force_keep:
                debug1('midx: removing redundant: %s\n'
                       % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
            bfull = os.path.join(self.dir, 'bup.bloom')
            # Reopen the bloom filter, if one exists for this pack dir.
            if self.bloom is None and os.path.exists(bfull):
                self.bloom = ShaBloom(bfull)
            # Search biggest packs first.
            self.packs = list(set(d.values()))
            self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
            # Only trust the bloom if it covers at least all our objects.
            if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))

        """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion."""
    # Git hashes '<type> <len>\0' + content with sha1.
    header = '%s %d\0' % (type, len(content))


def _shalist_sort_key(ent):
    # Git sorts tree entries as though directory names had a trailing '/'.
    (mode, name, id) = ent
    if stat.S_ISDIR(int(mode, 8)):
def open_idx(filename):
    """Open a .idx or .midx file, returning the appropriate wrapper object.

    Raises GitError for unrecognized extensions, headers, or versions.
    """
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # '\377tOc' magic marks a v2+ index; otherwise assume v1.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList."""
    # Periodic progress callback during the merge.
    def pfunc(count, total):
        qprogress('Reading indexes: %.2f%% (%d/%d)\r'
                  % (count*100.0/total, count, total))
    # Final callback, printed once at the end (unless suppressed).
    def pfinal(count, total):
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    # Default objcache factory for PackWriter: index over the repo's packs.
    return PackIdxList(repo('objects/pack'))
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        self.objcache_maker = objcache_maker

        # _open: lazily create the pack tempfile inside the repo, so the
        # final rename in _end() stays on the same filesystem.
        (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
        self.file = os.fdopen(fd, 'w+b')
        assert(name.endswith('.pack'))
        self.filename = name[:-5]
        # Pack header: 'PACK', version 2, object count 0 (patched in _end()).
        self.file.write('PACK\0\0\0\2\0\0\0\0')
        # 256 buckets (by first sha byte) used to build the idx fanout later.
        self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob.  So let's make sure it's
        # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.)  f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # Re-raise write failures as GitError, keeping the original traceback.
        raise GitError, e, sys.exc_info()[2]
        # crc32 is signed in python2; mask to the unsigned 32-bit value.
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)

    def _update_idx(self, sha, crc, size):
        # Bucket by first sha byte; the offset is where this object started.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))

    def _write(self, sha, type, content):
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        self.outbytes = self.count = 0

    def _require_objcache(self):
        # Build the objcache on first use; without one we can't answer
        # exists() queries, so that's a hard error.
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
                "PackWriter not opened or can't check exists w/o objcache")

    def exists(self, id, want_source=False):
        """Return non-empty if an object is found in the object cache."""
        self._require_objcache()
        return self.objcache.exists(id, want_source=want_source)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        # Deduplicate: only write objects the cache hasn't seen yet.
        if not self.exists(sha):
            self._write(sha, type, content)
            self.objcache.add(sha)

    def new_blob(self, blob):
        """Create a blob object in the pack with the supplied content."""
        return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Git requires tree entries sorted with directories as 'name/'.
        shalist = sorted(shalist, key = _shalist_sort_key)
        for (mode,name,bin) in shalist:
            # Git never writes a leading zero on the mode.
            assert(mode[0] != '0')
            assert(len(bin) == 20)
            # Entry format: '<mode> <name>\0' followed by the raw 20-byte sha.
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit object line by line; every field is optional
        # so callers can omit e.g. the parent for a root commit.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))

    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # Use the current user as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
        """Remove the pack file from disk."""
        os.unlink(self.filename + '.pack')

    def _end(self, run_midx=True):
        # Finalize the pack: patch the object count into the header, append
        # the pack's own sha1, write the idx, and rename both into place.
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        # Final name is derived from the sha over the sorted object list.
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        auto_midx(repo('objects/pack'))
    def close(self, run_midx=True):
        """Close the pack file and move it to its definitive path."""
        return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a v2 pack index for this pack; return the hex sha naming it."""
        idx_f = open(filename, 'w+b')
        # v2 magic + version.
        idx_f.write('\377tOc\0\0\0\2')
        # Fixed-size portion: header + fanout + (sha+crc+ofs) per object;
        # 64-bit offsets, if any, are appended past this point.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        # The C helper fills the tables; it must account for every object.
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        b = idx_f.read(8 + 4*256)
        # The pack's name is the sha1 of its sorted object list.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        # Trailing checksum over the idx contents themselves.
        idx_f.write(idx_sum.digest())
    # _git_date: format as '<epoch> <tz-offset>', the form git expects.
    return '%d %s' % (date, time.strftime('%z', time.localtime(date)))

    # _gitenv (used as a subprocess preexec_fn): point git at our repo.
    os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait() # not fatal
    # Each output line is '<sha> <refname>'.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit_id).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    # Guard against ref names that git would parse as options.
    assert(not ref.startswith('-'))
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # Output alternates 'commit <sha>' lines with '%ct' timestamp lines.
    for row in p.stdout:
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        yield (date, commit)
    rv = p.wait() # not fatal
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    # rev_list with count=1 yields at most one (date, commit) pair.
    for (date, commit) in rev_list(ref, count=1):
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # First, try it as a ref name.
    head = read_ref(committish)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    # Otherwise, try it as a full 40-char hex sha present in our packs.
    pL = PackIdxList(repo('objects/pack'))
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    # git update-ref checks oldval, making the update atomic w.r.t. races.
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then to the default ~/.bup location.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory is missing or the target
    exists but is not a directory; otherwise runs 'git init' and forces
    pack.indexVersion=2 in the new repo's config.
    """
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # BUGFIX: was '%d', which raises TypeError when formatting the
        # (string) path instead of producing the intended error message.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # A valid repo must have an objects/pack directory.
    if not os.path.isdir(repo('objects/pack/.')):
        # Only the default home repository is auto-initialized.
        if repodir == home_repodir:
        log('error: %r is not a bup/git repository\n' % repo())
    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
    # Entries are '<mode> <name>\0' followed by a raw 20-byte sha.
    while ofs < len(buf):
        z = buf[ofs:].find('\0')
        spl = buf[ofs:ofs+z].split(' ', 1)
        assert(len(spl) == 2)
        sha = buf[ofs+z+1:ofs+z+1+20]
        yield (spl[0], spl[1], sha)
    """Get Git's version and ensure a usable version is installed.

    The returned version is formatted as an ordered tuple with each position
    representing a digit in the version tag. For example, the following tuple
    would represent version 1.6.6.9:

        ('1', '6', '6', '9')
    """
    p = subprocess.Popen(['git', '--version'],
                         stdout=subprocess.PIPE)
    gvs = p.stdout.read()
    _git_wait('git --version', p)
    # Expected output shape: 'git version X.Y...'.
    m = re.match(r'git version (\S+.\S+)', gvs)
    raise GitError('git --version weird output: %r' % gvs)
    _ver = tuple(m.group(1).split('.'))
    # Minimum supported git version.
    needed = ('1','5', '3', '1')
    raise GitError('git version %s or higher is required; you have %s'
                   % ('.'.join(needed), '.'.join(_ver)))
def _git_wait(cmd, p):
    # Wait for a git subprocess and raise on any nonzero exit status.
    raise GitError('%s returned %d' % (cmd, rv))


def _git_capture(argv):
    # Run a git command and return its captured stdout (raises on failure).
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    _git_wait(repr(argv), p)
class _AbortableIter:
    """Iterator wrapper that runs a cleanup callback if iteration is
    abandoned before exhaustion."""
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # next(): pass items through; exhaustion counts as a clean finish.
        return self.it.next()
    except StopIteration, e:
    """Link to 'git cat-file' that is used to retrieve blob data."""
    # 'git cat-file --batch' needs git >= 1.5.6; otherwise fall back to
    # one subprocess per object, which is much slower.
    wanted = ('1','5','6')
    log('warning: git version < %s; bup will be slow.\n'
    self.get = self._slow_get
    self.p = self.inprogress = None
    self.get = self._fast_get

        # _abort: tear down the batch subprocess so _restart can relaunch it.
        self.p.stdout.close()
        self.p.stdin.close()
        self.inprogress = None

        # _restart: (re)spawn the long-lived batch process.
        self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  preexec_fn = _gitenv)
    def _fast_get(self, id):
        # Relaunch the batch process if it died or was never started.
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        log('_fast_get: opening %r while %r is open'
            % (id, self.inprogress))
        # Only one object may be streamed from the batch pipe at a time.
        assert(not self.inprogress)
        # Reject ids that could corrupt the line-based protocol or be
        # parsed as options.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        # Response header: '<sha> <type> <size>' or '<id> missing'.
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl

        # Stream exactly 'size' bytes; aborting mid-stream kills the pipe.
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # The batch protocol terminates each object with a newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
        except Exception, e:
    def _slow_get(self, id):
        # Fallback path for old git: one cat-file subprocess per object.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # First ask for the type, then stream the object body.
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)
    def _join(self, it):
        # Recursively expand an object into the blobs it reaches:
        # blobs yield themselves, trees recurse per entry, commits
        # recurse into their root tree.
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # The first line of a commit object is 'tree <hex-sha>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
        """Generate a list of the content of all blobs that can be reached
        from an object.  The hash given in 'id' must point to a blob, a tree
        or a commit. The content of all blobs that can be seen from trees or
        commits will be added to the list.
        """
        for d in self._join(self.get(id)):
        # A StopIteration from the underlying pipe just ends the stream.
        except StopIteration:
    """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
    for (n,c) in list_refs():
        if n.startswith('refs/tags/'):
            tags[c].append(name) # more than one tag can point at 'c'