1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
18 The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
86 and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
94 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
95 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
96 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
100 home_repodir = os.path.expanduser('~/.bup')
103 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
104 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
110 class GitError(Exception):
115 """Get the path to the git repository or one of its subdirectories."""
118 raise GitError('You should call check_repo_or_die()')
120 # If there's a .git subdirectory, then the actual repo is in there.
121 gd = os.path.join(repodir, '.git')
122 if os.path.exists(gd):
125 return os.path.join(repodir, sub)
129 full = os.path.abspath(path)
130 fullrepo = os.path.abspath(repo(''))
131 if not fullrepo.endswith('/'):
133 if full.startswith(fullrepo):
134 path = full[len(fullrepo):]
135 if path.startswith('index-cache/'):
136 path = path[len('index-cache/'):]
141 paths = [repo('objects/pack')]
142 paths += glob.glob(repo('index-cache/*/.'))
146 def auto_midx(objdir):
147 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
149 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
151 # make sure 'args' gets printed to help with debugging
152 add_error('%r: exception: %s' % (args, e))
155 add_error('%r: returned %d' % (args, rv))
157 args = [path.exe(), 'bloom', '--dir', objdir]
159 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
161 # make sure 'args' gets printed to help with debugging
162 add_error('%r: exception: %s' % (args, e))
165 add_error('%r: returned %d' % (args, rv))
168 def mangle_name(name, mode, gitmode):
169 """Mangle a file name to present an abstract name for segmented files.
170 Mangled file names will have the ".bup" extension added to them. If a
171 file's name already ends with ".bup", a ".bupl" extension is added to
172 disambiguate normal files from semgmented ones.
174 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
176 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
177 return name + '.bupl'
182 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
183 def demangle_name(name):
184 """Remove name mangling from a file name, if necessary.
186 The return value is a tuple (demangled_filename,mode), where mode is one of
189 * BUP_NORMAL : files that should be read as-is from the repository
190 * BUP_CHUNKED : files that were chunked and need to be assembled
192 For more information on the name mangling algorythm, see mangle_name()
194 if name.endswith('.bupl'):
195 return (name[:-5], BUP_NORMAL)
196 elif name.endswith('.bup'):
197 return (name[:-4], BUP_CHUNKED)
199 return (name, BUP_NORMAL)
202 def _encode_packobj(type, content):
205 szbits = (sz & 0x0f) | (_typemap[type]<<4)
208 if sz: szbits |= 0x80
214 z = zlib.compressobj(1)
216 yield z.compress(content)
220 def _encode_looseobj(type, content):
221 z = zlib.compressobj(1)
222 yield z.compress('%s %d\0' % (type, len(content)))
223 yield z.compress(content)
227 def _decode_looseobj(buf):
229 s = zlib.decompress(buf)
236 assert(type in _typemap)
237 assert(sz == len(content))
238 return (type, content)
241 def _decode_packobj(buf):
244 type = _typermap[(c & 0x70) >> 4]
251 sz |= (c & 0x7f) << shift
255 return (type, zlib.decompress(buf[i+1:]))
262 def find_offset(self, hash):
263 """Get the offset of an object inside the index file."""
264 idx = self._idx_from_hash(hash)
266 return self._ofs_from_idx(idx)
269 def exists(self, hash, want_source=False):
270 """Return nonempty if the object exists in this index."""
271 if hash and (self._idx_from_hash(hash) != None):
272 return want_source and os.path.basename(self.name) or True
276 return int(self.fanout[255])
278 def _idx_from_hash(self, hash):
279 global _total_searches, _total_steps
281 assert(len(hash) == 20)
283 start = self.fanout[b1-1] # range -1..254
284 end = self.fanout[b1] # range 0..255
286 _total_steps += 1 # lookup table is a step
289 mid = start + (end-start)/2
290 v = self._idx_to_hash(mid)
300 class PackIdxV1(PackIdx):
301 """Object representation of a Git pack index (version 1) file."""
302 def __init__(self, filename, f):
304 self.idxnames = [self.name]
305 self.map = mmap_read(f)
306 self.fanout = list(struct.unpack('!256I',
307 str(buffer(self.map, 0, 256*4))))
308 self.fanout.append(0) # entry "-1"
309 nsha = self.fanout[255]
311 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
313 def _ofs_from_idx(self, idx):
314 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
316 def _idx_to_hash(self, idx):
317 return str(self.shatable[idx*24+4 : idx*24+24])
320 for i in xrange(self.fanout[255]):
321 yield buffer(self.map, 256*4 + 24*i + 4, 20)
324 class PackIdxV2(PackIdx):
325 """Object representation of a Git pack index (version 2) file."""
326 def __init__(self, filename, f):
328 self.idxnames = [self.name]
329 self.map = mmap_read(f)
330 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
331 self.fanout = list(struct.unpack('!256I',
332 str(buffer(self.map, 8, 256*4))))
333 self.fanout.append(0) # entry "-1"
334 nsha = self.fanout[255]
335 self.sha_ofs = 8 + 256*4
336 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
337 self.ofstable = buffer(self.map,
338 self.sha_ofs + nsha*20 + nsha*4,
340 self.ofs64table = buffer(self.map,
341 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
343 def _ofs_from_idx(self, idx):
344 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
346 idx64 = ofs & 0x7fffffff
347 ofs = struct.unpack('!Q',
348 str(buffer(self.ofs64table, idx64*8, 8)))[0]
351 def _idx_to_hash(self, idx):
352 return str(self.shatable[idx*20:(idx+1)*20])
355 for i in xrange(self.fanout[255]):
356 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
359 extract_bits = _helpers.extract_bits
361 bloom_contains = _helpers.bloom_contains
362 bloom_add = _helpers.bloom_add
366 """Wrapper which contains data from multiple index files.
368 def __init__(self, filename, f=None, readwrite=False, expected=-1):
372 assert(filename.endswith('.bloom'))
375 self.rwfile = f = f or open(filename, 'r+b')
378 # Decide if we want to mmap() the pages as writable ('immediate'
379 # write) or else map them privately for later writing back to
380 # the file ('delayed' write). A bloom table's write access
381 # pattern is such that we dirty almost all the pages after adding
382 # very few entries. But the table is so big that dirtying
383 # *all* the pages often exceeds Linux's default
384 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
385 # thus causing it to start flushing the table before we're
386 # finished... even though there's more than enough space to
387 # store the bloom table in RAM.
389 # To work around that behaviour, if we calculate that we'll
390 # probably end up touching the whole table anyway (at least
391 # one bit flipped per memory page), let's use a "private" mmap,
392 # which defeats Linux's ability to flush it to disk. Then we'll
393 # flush it as one big lump during close().
394 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
395 self.delaywrite = expected > pages
396 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
398 self.map = mmap_readwrite_private(self.rwfile, close=False)
400 self.map = mmap_readwrite(self.rwfile, close=False)
403 f = f or open(filename, 'rb')
404 self.map = mmap_read(f)
405 got = str(self.map[0:4])
407 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
408 return self._init_failed()
409 ver = struct.unpack('!I', self.map[4:8])[0]
410 if ver < BLOOM_VERSION:
411 log('Warning: ignoring old-style (v%d) bloom %r\n'
413 return self._init_failed()
414 if ver > BLOOM_VERSION:
415 log('Warning: ignoring too-new (v%d) bloom %r\n'
417 return self._init_failed()
419 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
420 idxnamestr = str(self.map[16 + 2**self.bits:])
422 self.idxnames = idxnamestr.split('\0')
426 def _init_failed(self):
433 self.bits = self.entries = 0
436 return self.map and self.bits
442 if self.map and self.rwfile:
443 debug2("bloom: closing with %d entries\n" % self.entries)
444 self.map[12:16] = struct.pack('!I', self.entries)
447 self.rwfile.write(self.map)
450 self.rwfile.seek(16 + 2**self.bits)
452 self.rwfile.write('\0'.join(self.idxnames))
455 def pfalse_positive(self, additional=0):
456 n = self.entries + additional
459 return 100*(1-math.exp(-k*float(n)/m))**k
461 def add_idx(self, ix):
462 """Add the object to the filter, return current pfalse_positive."""
463 if not self.map: raise Exception, "Cannot add to closed bloom"
464 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
465 self.idxnames.append(os.path.basename(ix.name))
467 def exists(self, sha):
468 """Return nonempty if the object probably exists in the bloom filter."""
469 global _total_searches, _total_steps
471 if not self.map: return None
472 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
473 _total_steps += steps
477 def create(cls, name, expected, delaywrite=None, f=None, k=None):
478 """Create and return a bloom filter for `expected` entries."""
479 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
480 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
481 if bits > MAX_BLOOM_BITS[k]:
482 log('bloom: warning, max bits exceeded, non-optimal\n')
483 bits = MAX_BLOOM_BITS[k]
484 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
485 f = f or open(name, 'w+b')
487 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
488 assert(f.tell() == 16)
489 # NOTE: On some systems this will not extend+zerofill, but it does on
490 # darwin, linux, bsd and solaris.
491 f.truncate(16+2**bits)
493 if delaywrite != None and not delaywrite:
494 # tell it to expect very few objects, forcing a direct mmap
496 return cls(name, f=f, readwrite=True, expected=expected)
499 return int(self.entries)
503 """Wrapper which contains data from multiple index files.
504 Multiple index (.midx) files constitute a wrapper around index (.idx) files
505 and make it possible for bup to expand Git's indexing capabilities to vast
508 def __init__(self, filename):
510 self.force_keep = False
511 assert(filename.endswith('.midx'))
512 self.map = mmap_read(open(filename))
513 if str(self.map[0:4]) != 'MIDX':
514 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
515 self.force_keep = True
516 return self._init_failed()
517 ver = struct.unpack('!I', self.map[4:8])[0]
518 if ver < MIDX_VERSION:
519 log('Warning: ignoring old-style (v%d) midx %r\n'
521 self.force_keep = False # old stuff is boring
522 return self._init_failed()
523 if ver > MIDX_VERSION:
524 log('Warning: ignoring too-new (v%d) midx %r\n'
526 self.force_keep = True # new stuff is exciting
527 return self._init_failed()
529 self.bits = _helpers.firstword(self.map[8:12])
530 self.entries = 2**self.bits
531 self.fanout = buffer(self.map, 12, self.entries*4)
532 self.sha_ofs = 12 + self.entries*4
533 self.nsha = nsha = self._fanget(self.entries-1)
534 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
535 self.which_ofs = self.sha_ofs + 20*nsha
536 self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
537 self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
539 def _init_failed(self):
542 self.fanout = buffer('\0\0\0\0')
543 self.shatable = buffer('\0'*20)
546 def _fanget(self, i):
548 s = self.fanout[start:start+4]
549 return _helpers.firstword(s)
552 return str(self.shatable[i*20:(i+1)*20])
554 def _get_idx_i(self, i):
555 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
557 def _get_idxname(self, i):
558 return self.idxnames[self._get_idx_i(i)]
560 def exists(self, hash, want_source=False):
561 """Return nonempty if the object exists in the index files."""
562 global _total_searches, _total_steps
565 el = extract_bits(want, self.bits)
567 start = self._fanget(el-1)
568 startv = el << (32-self.bits)
572 end = self._fanget(el)
573 endv = (el+1) << (32-self.bits)
574 _total_steps += 1 # lookup table is a step
575 hashv = _helpers.firstword(hash)
576 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
579 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
580 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
581 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
583 #print ' %08x' % self._num(v)
586 startv = _helpers.firstword(v)
589 endv = _helpers.firstword(v)
591 return want_source and self._get_idxname(mid) or True
595 for i in xrange(self._fanget(self.entries-1)):
596 yield buffer(self.shatable, i*20, 20)
599 return int(self._fanget(self.entries-1))
604 def __init__(self, dir):
606 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
611 self.do_bloom = False
618 assert(_mpi_count == 0)
621 return iter(idxmerge(self.packs))
624 return sum(len(pack) for pack in self.packs)
626 def exists(self, hash, want_source=False):
627 """Return nonempty if the object exists in the index files."""
628 global _total_searches
630 if hash in self.also:
632 if self.do_bloom and self.bloom is not None:
633 _total_searches -= 1 # will be incremented by bloom
634 if self.bloom.exists(hash):
635 self.do_bloom = False
638 for i in xrange(len(self.packs)):
640 _total_searches -= 1 # will be incremented by sub-pack
641 ix = p.exists(hash, want_source=want_source)
643 # reorder so most recently used packs are searched first
644 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
649 def refresh(self, skip_midx = False):
650 """Refresh the index list.
651 This method verifies if .midx files were superseded (e.g. all of its
652 contents are in another, bigger .midx file) and removes the superseded
655 If skip_midx is True, all work on .midx files will be skipped and .midx
656 files will be removed from the list.
658 The module-global variable 'ignore_midx' can force this function to
659 always act as if skip_midx was True.
661        self.bloom = None # Always reopen the bloom as it may have been replaced
662 self.do_bloom = False
663 skip_midx = skip_midx or ignore_midx
664 d = dict((p.name, p) for p in self.packs
665 if not skip_midx or not isinstance(p, PackMidx))
666 if os.path.exists(self.dir):
669 for ix in self.packs:
670 if isinstance(ix, PackMidx):
671 for name in ix.idxnames:
672 d[os.path.join(self.dir, name)] = ix
673 for full in glob.glob(os.path.join(self.dir,'*.midx')):
676 (mxd, mxf) = os.path.split(mx.name)
678 for n in mx.idxnames:
679 if not os.path.exists(os.path.join(mxd, n)):
680 log(('warning: index %s missing\n' +
681 ' used by %s\n') % (n, mxf))
688 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
691 for sub in ix.idxnames:
692 found = d.get(os.path.join(self.dir, sub))
693 if not found or isinstance(found, PackIdx):
694 # doesn't exist, or exists but not in a midx
699 for name in ix.idxnames:
700 d[os.path.join(self.dir, name)] = ix
701 elif not ix.force_keep:
702 debug1('midx: removing redundant: %s\n'
703 % os.path.basename(ix.name))
705 for full in glob.glob(os.path.join(self.dir,'*.idx')):
713 bfull = os.path.join(self.dir, 'bup.bloom')
714 if self.bloom is None and os.path.exists(bfull):
715 self.bloom = ShaBloom(bfull)
716 self.packs = list(set(d.values()))
717 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
718 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
722 debug1('PackIdxList: using %d index%s.\n'
723 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
726 """Insert an additional object in the list."""
730 def calc_hash(type, content):
731 """Calculate some content's hash in the Git fashion."""
732 header = '%s %d\0' % (type, len(content))
738 def _shalist_sort_key(ent):
739 (mode, name, id) = ent
740 if stat.S_ISDIR(int(mode, 8)):
746 def open_idx(filename):
747 if filename.endswith('.idx'):
748 f = open(filename, 'rb')
750 if header[0:4] == '\377tOc':
751 version = struct.unpack('!I', header[4:8])[0]
753 return PackIdxV2(filename, f)
755 raise GitError('%s: expected idx file version 2, got %d'
756 % (filename, version))
757 elif len(header) == 8 and header[0:4] < '\377tOc':
758 return PackIdxV1(filename, f)
760 raise GitError('%s: unrecognized idx file header' % filename)
761 elif filename.endswith('.midx'):
762 return PackMidx(filename)
764 raise GitError('idx filenames must end with .idx or .midx')
767 def idxmerge(idxlist, final_progress=True):
768 """Generate a list of all the objects reachable in a PackIdxList."""
769 def pfunc(count, total):
770 progress('Reading indexes: %.2f%% (%d/%d)\r'
771 % (count*100.0/total, count, total))
772 def pfinal(count, total):
774 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
775 return merge_iter(idxlist, 10024, pfunc, pfinal)
778 def _make_objcache():
779 return PackIdxList(repo('objects/pack'))
782    """Writes Git objects inside a pack file."""
783 def __init__(self, objcache_maker=_make_objcache):
789 self.objcache_maker = objcache_maker
797 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
798 self.file = os.fdopen(fd, 'w+b')
799 assert(name.endswith('.pack'))
800 self.filename = name[:-5]
801 self.file.write('PACK\0\0\0\2\0\0\0\0')
802 self.idx = list(list() for i in xrange(256))
804 def _raw_write(self, datalist, sha):
807 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
808 # the file never has a *partial* blob. So let's make sure it's
809 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
810 # to our hashsplit algorithm.) f.write() does its own buffering,
811 # but that's okay because we'll flush it in _end().
812 oneblob = ''.join(datalist)
816 raise GitError, e, sys.exc_info()[2]
818 crc = zlib.crc32(oneblob) & 0xffffffff
819 self._update_idx(sha, crc, nw)
824 def _update_idx(self, sha, crc, size):
827 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
829 def _write(self, sha, type, content):
833 sha = calc_hash(type, content)
834 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
837 def breakpoint(self):
838 """Clear byte and object counts and return the last processed id."""
840 self.outbytes = self.count = 0
843 def _require_objcache(self):
844 if self.objcache is None and self.objcache_maker:
845 self.objcache = self.objcache_maker()
846 if self.objcache is None:
848 "PackWriter not opened or can't check exists w/o objcache")
850 def exists(self, id, want_source=False):
851 """Return non-empty if an object is found in the object cache."""
852 self._require_objcache()
853 return self.objcache.exists(id, want_source=want_source)
855 def maybe_write(self, type, content):
856 """Write an object to the pack file if not present and return its id."""
857 self._require_objcache()
858 sha = calc_hash(type, content)
859 if not self.exists(sha):
860 self._write(sha, type, content)
861 self.objcache.add(sha)
864 def new_blob(self, blob):
865 """Create a blob object in the pack with the supplied content."""
866 return self.maybe_write('blob', blob)
868 def new_tree(self, shalist):
869 """Create a tree object in the pack."""
870 shalist = sorted(shalist, key = _shalist_sort_key)
872 for (mode,name,bin) in shalist:
875 assert(mode[0] != '0')
877 assert(len(bin) == 20)
878 l.append('%s %s\0%s' % (mode,name,bin))
879 return self.maybe_write('tree', ''.join(l))
881 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
883 if tree: l.append('tree %s' % tree.encode('hex'))
884 if parent: l.append('parent %s' % parent.encode('hex'))
885 if author: l.append('author %s %s' % (author, _git_date(adate)))
886 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
889 return self.maybe_write('commit', '\n'.join(l))
891 def new_commit(self, parent, tree, date, msg):
892 """Create a commit object in the pack."""
893 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
894 commit = self._new_commit(tree, parent,
895 userline, date, userline, date,
900 """Remove the pack file from disk."""
906 os.unlink(self.filename + '.pack')
908 def _end(self, run_midx=True):
910 if not f: return None
916 # update object count
918 cp = struct.pack('!i', self.count)
922 # calculate the pack sha1sum
925 for b in chunkyreader(f):
927 packbin = sum.digest()
931 obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
933 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
934 if os.path.exists(self.filename + '.map'):
935 os.unlink(self.filename + '.map')
936 os.rename(self.filename + '.pack', nameprefix + '.pack')
937 os.rename(self.filename + '.idx', nameprefix + '.idx')
940 auto_midx(repo('objects/pack'))
943 def close(self, run_midx=True):
944 """Close the pack file and move it to its definitive path."""
945 return self._end(run_midx=run_midx)
947 def _write_pack_idx_v2(self, filename, idx, packbin):
948 idx_f = open(filename, 'w+b')
949 idx_f.write('\377tOc\0\0\0\2')
951 ofs64_ofs = 8 + 4*256 + 28*self.count
952 idx_f.truncate(ofs64_ofs)
954 idx_map = mmap_readwrite(idx_f, close=False)
955 idx_f.seek(0, SEEK_END)
956 count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
957 assert(count == self.count)
963 b = idx_f.read(8 + 4*256)
966 obj_list_sum = Sha1()
967 for b in chunkyreader(idx_f, 20*self.count):
969 obj_list_sum.update(b)
970 namebase = obj_list_sum.hexdigest()
972 for b in chunkyreader(idx_f):
974 idx_f.write(idx_sum.digest())
981 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
985 os.environ['GIT_DIR'] = os.path.abspath(repo())
988 def list_refs(refname = None):
989 """Generate a list of tuples in the form (refname,hash).
990 If a ref name is specified, list only this particular ref.
992 argv = ['git', 'show-ref', '--']
995 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
996 out = p.stdout.read().strip()
997 rv = p.wait() # not fatal
1001 for d in out.split('\n'):
1002 (sha, name) = d.split(' ', 1)
1003 yield (name, sha.decode('hex'))
1006 def read_ref(refname):
1007 """Get the commit id of the most recent commit made on a given ref."""
1008 l = list(list_refs(refname))
1016 def rev_list(ref, count=None):
1017 """Generate a list of reachable commits in reverse chronological order.
1019 This generator walks through commits, from child to parent, that are
1020 reachable via the specified ref and yields a series of tuples of the form
1023 If count is a non-zero integer, limit the number of commits to "count"
1026 assert(not ref.startswith('-'))
1029 opts += ['-n', str(atoi(count))]
1030 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1031 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1033 for row in p.stdout:
1035 if s.startswith('commit '):
1036 commit = s[7:].decode('hex')
1039 yield (date, commit)
1040 rv = p.wait() # not fatal
1042 raise GitError, 'git rev-list returned error %d' % rv
1045 def rev_get_date(ref):
1046 """Get the date of the latest commit on the specified ref."""
1047 for (date, commit) in rev_list(ref, count=1):
1049 raise GitError, 'no such commit %r' % ref
1052 def rev_parse(committish):
1053 """Resolve the full hash for 'committish', if it exists.
1055 Should be roughly equivalent to 'git rev-parse'.
1057 Returns the hex value of the hash if it is found, None if 'committish' does
1058 not correspond to anything.
1060 head = read_ref(committish)
1062 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1065 pL = PackIdxList(repo('objects/pack'))
1067 if len(committish) == 40:
1069 hash = committish.decode('hex')
1079 def update_ref(refname, newval, oldval):
1080 """Change the commit pointed to by a branch."""
1083 assert(refname.startswith('refs/heads/'))
1084 p = subprocess.Popen(['git', 'update-ref', refname,
1085 newval.encode('hex'), oldval.encode('hex')],
1086 preexec_fn = _gitenv)
1087 _git_wait('git update-ref', p)
1090 def guess_repo(path=None):
1091 """Set the path value in the global variable "repodir".
1092 This makes bup look for an existing bup repository, but not fail if a
1093 repository doesn't exist. Usually, if you are interacting with a bup
1094 repository, you would not be calling this function but using
1095 check_repo_or_die().
1101 repodir = os.environ.get('BUP_DIR')
1103 repodir = os.path.expanduser('~/.bup')
1106 def init_repo(path=None):
1107 """Create the Git bare repository for bup in a given path."""
1109 d = repo() # appends a / to the path
1110 parent = os.path.dirname(os.path.dirname(d))
1111 if parent and not os.path.exists(parent):
1112 raise GitError('parent directory "%s" does not exist\n' % parent)
1113 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1114 raise GitError('"%d" exists but is not a directory\n' % d)
1115 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1116 preexec_fn = _gitenv)
1117 _git_wait('git init', p)
1118 # Force the index version configuration in order to ensure bup works
1119 # regardless of the version of the installed Git binary.
1120 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1121 stdout=sys.stderr, preexec_fn = _gitenv)
1122 _git_wait('git config', p)
1125 def check_repo_or_die(path=None):
1126 """Make sure a bup repository exists, and abort if not.
1127 If the path to a particular repository was not specified, this function
1128 initializes the default repository automatically.
1131 if not os.path.isdir(repo('objects/pack/.')):
1132 if repodir == home_repodir:
1135 log('error: %r is not a bup/git repository\n' % repo())
1140 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1142 while ofs < len(buf):
1143 z = buf[ofs:].find('\0')
1145 spl = buf[ofs:ofs+z].split(' ', 1)
1146 assert(len(spl) == 2)
1147 sha = buf[ofs+z+1:ofs+z+1+20]
1149 yield (spl[0], spl[1], sha)
1154 """Get Git's version and ensure a usable version is installed.
1156 The returned version is formatted as an ordered tuple with each position
1157 representing a digit in the version tag. For example, the following tuple
1158 would represent version 1.6.6.9:
1160 ('1', '6', '6', '9')
1164 p = subprocess.Popen(['git', '--version'],
1165 stdout=subprocess.PIPE)
1166 gvs = p.stdout.read()
1167 _git_wait('git --version', p)
1168 m = re.match(r'git version (\S+.\S+)', gvs)
1170 raise GitError('git --version weird output: %r' % gvs)
1171 _ver = tuple(m.group(1).split('.'))
1172 needed = ('1','5', '3', '1')
1174 raise GitError('git version %s or higher is required; you have %s'
1175 % ('.'.join(needed), '.'.join(_ver)))
1179 def _git_wait(cmd, p):
1182 raise GitError('%s returned %d' % (cmd, rv))
1185 def _git_capture(argv):
1186 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1188 _git_wait(repr(argv), p)
1192 class _AbortableIter:
1193 def __init__(self, it, onabort = None):
1195 self.onabort = onabort
1203 return self.it.next()
1204 except StopIteration, e:
1212 """Abort iteration and call the abortion callback, if needed."""
1224 """Link to 'git cat-file' that is used to retrieve blob data."""
1227 wanted = ('1','5','6')
1230 log('warning: git version < %s; bup will be slow.\n'
1233 self.get = self._slow_get
1235 self.p = self.inprogress = None
1236 self.get = self._fast_get
1240 self.p.stdout.close()
1241 self.p.stdin.close()
1243 self.inprogress = None
1247 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1248 stdin=subprocess.PIPE,
1249 stdout=subprocess.PIPE,
1252 preexec_fn = _gitenv)
1254 def _fast_get(self, id):
1255 if not self.p or self.p.poll() != None:
1258 assert(self.p.poll() == None)
1260 log('_fast_get: opening %r while %r is open'
1261 % (id, self.inprogress))
1262 assert(not self.inprogress)
1263 assert(id.find('\n') < 0)
1264 assert(id.find('\r') < 0)
1265 assert(not id.startswith('-'))
1266 self.inprogress = id
1267 self.p.stdin.write('%s\n' % id)
1268 self.p.stdin.flush()
1269 hdr = self.p.stdout.readline()
1270 if hdr.endswith(' missing\n'):
1271 self.inprogress = None
1272 raise KeyError('blob %r is missing' % id)
1273 spl = hdr.split(' ')
1274 if len(spl) != 3 or len(spl[0]) != 40:
1275 raise GitError('expected blob, got %r' % spl)
1276 (hex, type, size) = spl
1278 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1279 onabort = self._abort)
1284 assert(self.p.stdout.readline() == '\n')
1285 self.inprogress = None
1286 except Exception, e:
1290 def _slow_get(self, id):
1291 assert(id.find('\n') < 0)
1292 assert(id.find('\r') < 0)
1293 assert(id[0] != '-')
1294 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1297 p = subprocess.Popen(['git', 'cat-file', type, id],
1298 stdout=subprocess.PIPE,
1299 preexec_fn = _gitenv)
1300 for blob in chunkyreader(p.stdout):
1302 _git_wait('git cat-file', p)
1304 def _join(self, it):
1309 elif type == 'tree':
1310 treefile = ''.join(it)
1311 for (mode, name, sha) in treeparse(treefile):
1312 for blob in self.join(sha.encode('hex')):
1314 elif type == 'commit':
1315 treeline = ''.join(it).split('\n')[0]
1316 assert(treeline.startswith('tree '))
1317 for blob in self.join(treeline[5:]):
1320 raise GitError('invalid object type %r: expected blob/tree/commit'
1324 """Generate a list of the content of all blobs that can be reached
1325 from an object. The hash given in 'id' must point to a blob, a tree
1326 or a commit. The content of all blobs that can be seen from trees or
1327 commits will be added to the list.
1330 for d in self._join(self.get(id)):
1332 except StopIteration:
1336 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1338 for (n,c) in list_refs():
1339 if n.startswith('refs/tags/'):
1344 tags[c].append(name) # more than one tag can point at 'c'