1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
10 SEEK_END=2 # os.SEEK_END is not defined in python 2.4
12 """Discussion of bloom constants for bup:
14 There are four basic things to consider when building a bloom filter:
15 The size, in bits, of the filter
16 The capacity, in entries, of the filter
17 The probability of a false positive that is tolerable
The number of bits readily available to use for addressing filter bits
20 There is one major tunable that is not directly related to the above:
21 k: the number of bits set in the filter per entry
23 Here's a wall of numbers showing the relationship between k; the ratio between
24 the filter size in bits and the entries in the filter; and pfalse_positive:
26 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
27 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
28 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
53 Here's a table showing available repository size for a given pfalse_positive
54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
55 filter and 8192bytes per object):
57 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
65 filter size scales linearly with repository size for a given k and pfalse.
67 Here's a table of filter sizes for a 1 TiB repository:
69 pfalse| k=3 | k=4 | k=5 | k=6
70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
76 per lookup will be worse than the two required for midx.
77 * We want the pfalse_positive to be low enough that the cost of sometimes
78 faulting on the midx doesn't overcome the benefit of the bloom filter.
79 * We have readily available 160 bits for addressing the filter.
80 * We want to be able to have a single bloom address entire repositories of
83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
84 that bup needs. As such, I've implemented bloom addressing, adding and
85 checking functions in C for these two values. Because k=5 requires less space
and gives better overall pfalse_positive performance, it is preferred if a
87 table with k=5 can represent the repository.
89 None of this tells us what max_pfalse_positive to choose.
91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# Bloom-filter tuning knobs; see the long discussion above.
MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
# Default repository location (used when no BUP_DIR is configured).
home_repodir = os.path.expanduser('~/.bup')
# Git object type name <-> packfile 3-bit type code, both directions.
_typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
_typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
110 class GitError(Exception):
115 """Get the path to the git repository or one of its subdirectories."""
118 raise GitError('You should call check_repo_or_die()')
120 # If there's a .git subdirectory, then the actual repo is in there.
121 gd = os.path.join(repodir, '.git')
122 if os.path.exists(gd):
125 return os.path.join(repodir, sub)
129 paths = [repo('objects/pack')]
130 paths += glob.glob(repo('index-cache/*/.'))
def auto_midx(objdir):
    """Run 'bup midx --auto' and then 'bup bloom' over objdir.

    Best-effort maintenance: failures are recorded via add_error(),
    not raised to the caller.
    """
    args = [path.exe(), 'midx', '--auto', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
    # Second pass: regenerate the bloom filter for the same directory.
    args = [path.exe(), 'bloom', '--dir', objdir]
    rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
    # make sure 'args' gets printed to help with debugging
    add_error('%r: exception: %s' % (args, e))
    add_error('%r: returned %d' % (args, rv))
def mangle_name(name, mode, gitmode):
    """Mangle a file name to present an abstract name for segmented files.
    Mangled file names will have the ".bup" extension added to them. If a
    file's name already ends with ".bup", a ".bupl" extension is added to
    disambiguate normal files from segmented ones.
    """
    if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
        # Stored as a regular file but represented differently in git:
        # this is a chunked (segmented) file.
        return name + '.bup'
    elif name.endswith('.bup') or name[:-1].endswith('.bup'):
        # Avoid colliding with the mangled names of chunked files.
        return name + '.bupl'
    else:
        return name
(BUP_NORMAL, BUP_CHUNKED) = (0,1)
def demangle_name(name):
    """Remove name mangling from a file name, if necessary.

    The return value is a tuple (demangled_filename, mode), where mode is one
    of the following:

    * BUP_NORMAL  : files that should be read as-is from the repository
    * BUP_CHUNKED : files that were chunked and need to be assembled

    For more information on the name mangling algorithm, see mangle_name()
    """
    if name.endswith('.bupl'):
        return (name[:-5], BUP_NORMAL)
    elif name.endswith('.bup'):
        return (name[:-4], BUP_CHUNKED)
    else:
        return (name, BUP_NORMAL)
def _encode_packobj(type, content):
    """Yield the packfile encoding of 'content': a variable-width
    size+type header (git packfile style) followed by the
    zlib-deflated data.
    """
    szout = ''
    sz = len(content)
    # The low 4 size bits share the first byte with the 3-bit type code.
    szbits = (sz & 0x0f) | (_typemap[type]<<4)
    sz >>= 4
    while 1:
        if sz: szbits |= 0x80   # high bit: more size bytes follow
        szout += chr(szbits)
        if not sz:
            break
        szbits = sz & 0x7f
        sz >>= 7
    z = zlib.compressobj(1)
    yield szout
    yield z.compress(content)
    yield z.flush()   # without the flush, the deflate stream is truncated
def _encode_looseobj(type, content):
    """Yield the zlib-deflated loose-object encoding of 'content':
    a '<type> <size>' NUL-terminated header followed by the content.
    """
    z = zlib.compressobj(1)
    yield z.compress('%s %d\0' % (type, len(content)))
    yield z.compress(content)
    yield z.flush()   # flush the compressor or the stream is truncated
def _decode_looseobj(buf):
    """Inverse of _encode_looseobj: inflate 'buf', split the
    '<type> <size>\\0<content>' layout and return (type, content)."""
    s = zlib.decompress(buf)
    assert(type in _typemap)
    assert(sz == len(content))
    return (type, content)
def _decode_packobj(buf):
    """Inverse of _encode_packobj: parse the variable-width size/type
    header, inflate the rest and return (type, content)."""
    # Bits 4-6 of the first byte carry the 3-bit type code.
    type = _typermap[(c & 0x70) >> 4]
    # Continuation bytes each contribute 7 more size bits.
    sz |= (c & 0x7f) << shift
    return (type, zlib.decompress(buf[i+1:]))
    def find_offset(self, hash):
        """Get the offset of an object inside the index file."""
        idx = self._idx_from_hash(hash)
        # _idx_from_hash yields the entry's position within this index.
        return self._ofs_from_idx(idx)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in this index.

        With want_source=True, a hit returns this index's basename.
        """
        if hash and (self._idx_from_hash(hash) != None):
            return want_source and os.path.basename(self.name) or True
264 return int(self.fanout[255])
    def _idx_from_hash(self, hash):
        """Binary-search this index for the 20-byte 'hash'; return its
        entry position, searching only the matching fanout bucket."""
        global _total_searches, _total_steps
        assert(len(hash) == 20)
        # The first sha byte (b1) selects the fanout bucket boundaries.
        start = self.fanout[b1-1] # range -1..254
        end = self.fanout[b1] # range 0..255
        _total_steps += 1 # lookup table is a step
        mid = start + (end-start)/2
        v = self._idx_to_hash(mid)
class PackIdxV1(PackIdx):
    """Object representation of a Git pack index (version 1) file."""
    def __init__(self, filename, f):
        # NOTE(review): self.name is presumably assigned 'filename'
        # just before this — confirm.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # 256 fanout counters, 4 bytes each, network byte order.
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 0, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        # v1 entries are 24 bytes each: 4-byte offset + 20-byte sha.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*24)

    def _ofs_from_idx(self, idx):
        # Offset is the first 4 bytes of the 24-byte entry.
        return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]

    def _idx_to_hash(self, idx):
        # The sha occupies bytes 4..24 of the entry.
        return str(self.shatable[idx*24+4 : idx*24+24])

        # (__iter__ body) Yield each 20-byte sha, in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 256*4 + 24*i + 4, 20)
class PackIdxV2(PackIdx):
    """Object representation of a Git pack index (version 2) file."""
    def __init__(self, filename, f):
        # NOTE(review): self.name is presumably assigned 'filename'
        # just before this — confirm.
        self.idxnames = [self.name]
        self.map = mmap_read(f)
        # v2 header: '\377tOc' magic followed by !I version == 2.
        assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
        self.fanout = list(struct.unpack('!256I',
                                         str(buffer(self.map, 8, 256*4))))
        self.fanout.append(0)  # entry "-1"
        nsha = self.fanout[255]
        self.sha_ofs = 8 + 256*4
        # v2 stores shas, crcs and offsets in separate parallel tables.
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.ofstable = buffer(self.map,
                               self.sha_ofs + nsha*20 + nsha*4,
        # 64-bit offset spill table, used for offsets >= 2**31.
        self.ofs64table = buffer(self.map,
                                 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)

    def _ofs_from_idx(self, idx):
        ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
        # High bit set means the low 31 bits index the 64-bit table.
        idx64 = ofs & 0x7fffffff
        ofs = struct.unpack('!Q',
                            str(buffer(self.ofs64table, idx64*8, 8)))[0]

    def _idx_to_hash(self, idx):
        # shas are packed contiguously, 20 bytes each.
        return str(self.shatable[idx*20:(idx+1)*20])

        # (__iter__ body) Yield each 20-byte sha, in index order.
        for i in xrange(self.fanout[255]):
            yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Bind the C-accelerated helpers (bup._helpers) to module-level names.
extract_bits = _helpers.extract_bits
bloom_contains = _helpers.bloom_contains
bloom_add = _helpers.bloom_add
354 """Wrapper which contains data from multiple index files.
    def __init__(self, filename, f=None, readwrite=False, expected=-1):
        """Open the bloom filter stored in 'filename'.

        With readwrite=True the table is mapped for writing, and
        'expected' (the anticipated number of new entries) decides
        between immediate and delayed write-back.
        """
        assert(filename.endswith('.bloom'))
        self.rwfile = f = f or open(filename, 'r+b')
        # Decide if we want to mmap() the pages as writable ('immediate'
        # write) or else map them privately for later writing back to
        # the file ('delayed' write).  A bloom table's write access
        # pattern is such that we dirty almost all the pages after adding
        # very few entries.  But the table is so big that dirtying
        # *all* the pages often exceeds Linux's default
        # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
        # thus causing it to start flushing the table before we're
        # finished... even though there's more than enough space to
        # store the bloom table in RAM.
        #
        # To work around that behaviour, if we calculate that we'll
        # probably end up touching the whole table anyway (at least
        # one bit flipped per memory page), let's use a "private" mmap,
        # which defeats Linux's ability to flush it to disk.  Then we'll
        # flush it as one big lump during close().
        pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
        self.delaywrite = expected > pages
        debug1('bloom: delaywrite=%r\n' % self.delaywrite)
        self.map = mmap_readwrite_private(self.rwfile, close=False)
        self.map = mmap_readwrite(self.rwfile, close=False)
        # Read-only path: plain read mapping.
        f = f or open(filename, 'rb')
        self.map = mmap_read(f)
        # File layout: 'BLOM' magic, !I version, !HH bits/k, !I entries,
        # then 2**bits table bytes, then NUL-separated idx names.
        got = str(self.map[0:4])
        log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
        return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < BLOOM_VERSION:
            log('Warning: ignoring old-style (v%d) bloom %r\n'
            return self._init_failed()
        if ver > BLOOM_VERSION:
            log('Warning: ignoring too-new (v%d) bloom %r\n'
            return self._init_failed()
        self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
        idxnamestr = str(self.map[16 + 2**self.bits:])
        self.idxnames = idxnamestr.split('\0')
    def _init_failed(self):
        """Reset state after a failed open; the filter reads as invalid."""
        self.bits = self.entries = 0
424 return self.map and self.bits
430 if self.map and self.rwfile:
431 debug2("bloom: closing with %d entries\n" % self.entries)
432 self.map[12:16] = struct.pack('!I', self.entries)
435 self.rwfile.write(self.map)
438 self.rwfile.seek(16 + 2**self.bits)
440 self.rwfile.write('\0'.join(self.idxnames))
    def pfalse_positive(self, additional=0):
        """Return the expected false-positive rate (as a percentage) if
        'additional' more entries were added to the filter."""
        n = self.entries + additional
        # Standard bloom-filter estimate: (1 - e^(-kn/m))^k.
        return 100*(1-math.exp(-k*float(n)/m))**k
449 def add_idx(self, ix):
450 """Add the object to the filter, return current pfalse_positive."""
451 if not self.map: raise Exception, "Cannot add to closed bloom"
452 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
453 self.idxnames.append(os.path.basename(ix.name))
    def exists(self, sha):
        """Return nonempty if the object probably exists in the bloom filter.

        Bloom filters can yield false positives, never false negatives.
        """
        global _total_searches, _total_steps
        if not self.map: return None
        found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
        _total_steps += steps
    def create(cls, name, expected, delaywrite=None, f=None, k=None):
        """Create and return a bloom filter for `expected` entries."""
        bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
        # Prefer k=5 (smaller table, better pfalse) when bits permit.
        k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
        if bits > MAX_BLOOM_BITS[k]:
            log('bloom: warning, max bits exceeded, non-optimal\n')
            bits = MAX_BLOOM_BITS[k]
        debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
        f = f or open(name, 'w+b')
        # 16-byte header: !I version, !HH bits/k, !I entry count (0).
        f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
        assert(f.tell() == 16)
        # NOTE: On some systems this will not extend+zerofill, but it does on
        # darwin, linux, bsd and solaris.
        f.truncate(16+2**bits)
        if delaywrite != None and not delaywrite:
            # tell it to expect very few objects, forcing a direct mmap
            return cls(name, f=f, readwrite=True, expected=expected)
487 return int(self.entries)
491 """Wrapper which contains data from multiple index files.
492 Multiple index (.midx) files constitute a wrapper around index (.idx) files
493 and make it possible for bup to expand Git's indexing capabilities to vast
    def __init__(self, filename):
        """Open and validate the .midx file 'filename'.

        On a bad/old/new header, _init_failed() leaves the object in a
        harmless empty state; force_keep tells refresh() whether the
        file should survive cleanup.
        """
        self.force_keep = False
        assert(filename.endswith('.midx'))
        self.map = mmap_read(open(filename))
        # Header: 'MIDX' magic followed by a !I format version.
        if str(self.map[0:4]) != 'MIDX':
            log('Warning: skipping: invalid MIDX header in %r\n' % filename)
            self.force_keep = True
            return self._init_failed()
        ver = struct.unpack('!I', self.map[4:8])[0]
        if ver < MIDX_VERSION:
            log('Warning: ignoring old-style (v%d) midx %r\n'
            self.force_keep = False # old stuff is boring
            return self._init_failed()
        if ver > MIDX_VERSION:
            log('Warning: ignoring too-new (v%d) midx %r\n'
            self.force_keep = True # new stuff is exciting
            return self._init_failed()
        self.bits = _helpers.firstword(self.map[8:12])
        self.entries = 2**self.bits
        # Layout after the 12-byte header: fanout table, sha table,
        # which-idx table, then NUL-separated idx names.
        self.fanout = buffer(self.map, 12, self.entries*4)
        self.sha_ofs = 12 + self.entries*4
        self.nsha = nsha = self._fanget(self.entries-1)
        self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
        self.which_ofs = self.sha_ofs + 20*nsha
        self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
        self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
    def _init_failed(self):
        # Install minimal empty tables so later reads don't crash.
        self.fanout = buffer('\0\0\0\0')
        self.shatable = buffer('\0'*20)
    def _fanget(self, i):
        """Return fanout entry i: the count of objects whose leading
        bits are <= i."""
        s = self.fanout[start:start+4]
        return _helpers.firstword(s)
540 return str(self.shatable[i*20:(i+1)*20])
    def _get_idx_i(self, i):
        """Return the position (into self.idxnames) of the .idx file
        that owns midx entry i."""
        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
    def _get_idxname(self, i):
        """Return the name of the .idx file that owns midx entry i."""
        return self.idxnames[self._get_idx_i(i)]
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Interpolation search over the sorted sha table; with
        want_source=True a hit returns the owning .idx filename.
        """
        global _total_searches, _total_steps
        # The top self.bits of the sha select the fanout bucket.
        el = extract_bits(want, self.bits)
        start = self._fanget(el-1)
        startv = el << (32-self.bits)
        end = self._fanget(el)
        endv = (el+1) << (32-self.bits)
        _total_steps += 1 # lookup table is a step
        hashv = _helpers.firstword(hash)
        #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
        #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
        # Interpolate the probe position from the 32-bit prefix values.
        mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
        #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
        #print ' %08x' % self._num(v)
        startv = _helpers.firstword(v)
        endv = _helpers.firstword(v)
        return want_source and self._get_idxname(mid) or True
583 for i in xrange(self._fanget(self.entries-1)):
584 yield buffer(self.shatable, i*20, 20)
587 return int(self._fanget(self.entries-1))
    def __init__(self, dir):
        assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
        # Bloom lookups stay disabled until refresh() finds a valid filter.
        self.do_bloom = False
606 assert(_mpi_count == 0)
609 return iter(idxmerge(self.packs))
612 return sum(len(pack) for pack in self.packs)
    def exists(self, hash, want_source=False):
        """Return nonempty if the object exists in the index files.

        Consults the 'also' set and the bloom filter before falling
        back to a linear scan of the pack indexes; a hit moves that
        pack to the front of the search order.
        """
        global _total_searches
        if hash in self.also:
        if self.do_bloom and self.bloom is not None:
            _total_searches -= 1 # will be incremented by bloom
            if self.bloom.exists(hash):
                self.do_bloom = False
        for i in xrange(len(self.packs)):
            _total_searches -= 1 # will be incremented by sub-pack
            ix = p.exists(hash, want_source=want_source)
            # reorder so most recently used packs are searched first
            self.packs = [p] + self.packs[:i] + self.packs[i+1:]
    def refresh(self, skip_midx = False):
        """Refresh the index list.
        This method verifies if .midx files were superseded (e.g. all of its
        contents are in another, bigger .midx file) and removes the superseded
        files.

        If skip_midx is True, all work on .midx files will be skipped and .midx
        files will be removed from the list.

        The module-global variable 'ignore_midx' can force this function to
        always act as if skip_midx was True.
        """
        self.bloom = None # Always reopen the bloom as it may have been replaced
        self.do_bloom = False
        skip_midx = skip_midx or ignore_midx
        d = dict((p.name, p) for p in self.packs
                 if not skip_midx or not isinstance(p, PackMidx))
        if os.path.exists(self.dir):
            # Map each idx name covered by an already-loaded midx to it.
            for ix in self.packs:
                if isinstance(ix, PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            for full in glob.glob(os.path.join(self.dir,'*.midx')):
                (mxd, mxf) = os.path.split(mx.name)
                for n in mx.idxnames:
                    if not os.path.exists(os.path.join(mxd, n)):
                        log(('warning: index %s missing\n' +
                             '  used by %s\n') % (n, mxf))
                # Bigger midxes first: they supersede more idx files.
                midxl.sort(lambda x,y: -cmp(len(x),len(y)))
                for sub in ix.idxnames:
                    found = d.get(os.path.join(self.dir, sub))
                    if not found or isinstance(found, PackIdx):
                        # doesn't exist, or exists but not in a midx
                for name in ix.idxnames:
                    d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % os.path.basename(ix.name))
            for full in glob.glob(os.path.join(self.dir,'*.idx')):
        bfull = os.path.join(self.dir, 'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
        # Only trust the bloom filter if it covers every known object.
        if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
        debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
714 """Insert an additional object in the list."""
def calc_hash(type, content):
    """Calculate some content's hash in the Git fashion: sha1 over the
    '<type> <size>' NUL-terminated header followed by the content."""
    header = '%s %d\0' % (type, len(content))
726 def _shalist_sort_key(ent):
727 (mode, name, id) = ent
728 if stat.S_ISDIR(int(mode, 8)):
def open_idx(filename):
    """Open a pack index and return the matching wrapper object:
    PackIdxV1/PackIdxV2 for .idx (dispatching on the file header),
    PackMidx for .midx; raises GitError otherwise."""
    if filename.endswith('.idx'):
        f = open(filename, 'rb')
        # '\377tOc' magic marks a v2 index; plain v1 has no magic.
        if header[0:4] == '\377tOc':
            version = struct.unpack('!I', header[4:8])[0]
            return PackIdxV2(filename, f)
            raise GitError('%s: expected idx file version 2, got %d'
                           % (filename, version))
        elif len(header) == 8 and header[0:4] < '\377tOc':
            return PackIdxV1(filename, f)
            raise GitError('%s: unrecognized idx file header' % filename)
    elif filename.endswith('.midx'):
        return PackMidx(filename)
        raise GitError('idx filenames must end with .idx or .midx')
def idxmerge(idxlist, final_progress=True):
    """Generate a list of all the objects reachable in a PackIdxList.

    Progress is reported while reading; the final "done" line is only
    printed when final_progress is true.
    """
    def pfunc(count, total):
        progress('Reading indexes: %.2f%% (%d/%d)\r'
                 % (count*100.0/total, count, total))
    def pfinal(count, total):
        # Honor the caller's final_progress flag.
        if final_progress:
            log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
    return merge_iter(idxlist, 10024, pfunc, pfinal)
def _make_objcache():
    """Default objcache factory: an index list over the repo's packs."""
    pack_dir = repo('objects/pack')
    return PackIdxList(pack_dir)
    """Writes Git objects inside a pack file."""
    def __init__(self, objcache_maker=_make_objcache):
        # Factory invoked lazily (see _require_objcache) to build the
        # object-existence cache.
        self.objcache_maker = objcache_maker
785 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
786 self.file = os.fdopen(fd, 'w+b')
787 assert(name.endswith('.pack'))
788 self.filename = name[:-5]
789 self.file.write('PACK\0\0\0\2\0\0\0\0')
790 self.idx = list(list() for i in xrange(256))
    def _raw_write(self, datalist, sha):
        """Write 'datalist' to the pack file as one chunk and record
        (sha, crc, offset) in the in-memory index."""
        # in case we get interrupted (eg. KeyboardInterrupt), it's best if
        # the file never has a *partial* blob. So let's make sure it's
        # all-or-nothing. (The blob shouldn't be very big anyway, thanks
        # to our hashsplit algorithm.) f.write() does its own buffering,
        # but that's okay because we'll flush it in _end().
        oneblob = ''.join(datalist)
        # Re-raise with the original traceback (Python 2 3-arg raise).
        raise GitError, e, sys.exc_info()[2]
        crc = zlib.crc32(oneblob) & 0xffffffff
        self._update_idx(sha, crc, nw)
    def _update_idx(self, sha, crc, size):
        # Bucket by the sha's first byte, matching the idx fanout layout;
        # the entry's offset is the current position minus its size.
        self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
    def _write(self, sha, type, content):
        # NOTE(review): presumably only recomputed when the caller passed
        # no sha (an 'if not sha:' guard appears elided here) — confirm.
        sha = calc_hash(type, content)
        size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
    def breakpoint(self):
        """Clear byte and object counts and return the last processed id."""
        self.outbytes = self.count = 0
    def _require_objcache(self):
        """Build self.objcache on first use; error out if impossible."""
        if self.objcache is None and self.objcache_maker:
            self.objcache = self.objcache_maker()
        if self.objcache is None:
            "PackWriter not opened or can't check exists w/o objcache")
838 def exists(self, id, want_source=False):
839 """Return non-empty if an object is found in the object cache."""
840 self._require_objcache()
841 return self.objcache.exists(id, want_source=want_source)
    def maybe_write(self, type, content):
        """Write an object to the pack file if not present and return its id."""
        self._require_objcache()
        sha = calc_hash(type, content)
        if not self.exists(sha):
            self._write(sha, type, content)
            # Record it so we don't write a duplicate later.
            self.objcache.add(sha)
852 def new_blob(self, blob):
853 """Create a blob object in the pack with the supplied content."""
854 return self.maybe_write('blob', blob)
    def new_tree(self, shalist):
        """Create a tree object in the pack."""
        # Git requires tree entries sorted as if dirs had a trailing '/'.
        shalist = sorted(shalist, key = _shalist_sort_key)
        for (mode,name,bin) in shalist:
            assert(mode[0] != '0')   # git rejects zero-padded modes
            assert(len(bin) == 20)   # binary sha, not hex
            l.append('%s %s\0%s' % (mode,name,bin))
        return self.maybe_write('tree', ''.join(l))
    def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
        # Assemble the commit text line by line; empty fields are skipped.
        if tree: l.append('tree %s' % tree.encode('hex'))
        if parent: l.append('parent %s' % parent.encode('hex'))
        if author: l.append('author %s %s' % (author, _git_date(adate)))
        if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
        return self.maybe_write('commit', '\n'.join(l))
    def new_commit(self, parent, tree, date, msg):
        """Create a commit object in the pack."""
        # The current user acts as both author and committer.
        userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = self._new_commit(tree, parent,
                                  userline, date, userline, date,
888 """Remove the pack file from disk."""
894 os.unlink(self.filename + '.pack')
    def _end(self, run_midx=True):
        """Finish the pack: patch the object count into the header,
        write the .idx, and rename both files into objects/pack/ under
        their content-derived name; optionally rebuild midx files."""
        if not f: return None
        # update object count
        cp = struct.pack('!i', self.count)
        # calculate the pack sha1sum
        for b in chunkyreader(f):
        packbin = sum.digest()
        obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
        if os.path.exists(self.filename + '.map'):
            os.unlink(self.filename + '.map')
        os.rename(self.filename + '.pack', nameprefix + '.pack')
        os.rename(self.filename + '.idx', nameprefix + '.idx')
        auto_midx(repo('objects/pack'))
931 def close(self, run_midx=True):
932 """Close the pack file and move it to its definitive path."""
933 return self._end(run_midx=run_midx)
    def _write_pack_idx_v2(self, filename, idx, packbin):
        """Write a v2 pack index for 'idx' and return the hex sha of
        the sorted object list (used to name the pack)."""
        idx_f = open(filename, 'w+b')
        # v2 magic + version.
        idx_f.write('\377tOc\0\0\0\2')
        # Fixed-size region: header + fanout + (sha+crc+ofs32) per object.
        ofs64_ofs = 8 + 4*256 + 28*self.count
        idx_f.truncate(ofs64_ofs)
        idx_map = mmap_readwrite(idx_f, close=False)
        idx_f.seek(0, SEEK_END)
        # The C helper fills the tables directly into the mapping.
        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
        assert(count == self.count)
        b = idx_f.read(8 + 4*256)
        # Hash just the sorted sha list to derive the pack's name.
        obj_list_sum = Sha1()
        for b in chunkyreader(idx_f, 20*self.count):
            obj_list_sum.update(b)
        namebase = obj_list_sum.hexdigest()
        for b in chunkyreader(idx_f):
        idx_f.write(idx_sum.digest())
969 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
973 os.environ['GIT_DIR'] = os.path.abspath(repo())
def list_refs(refname = None):
    """Generate a list of tuples in the form (refname,hash).
    If a ref name is specified, list only this particular ref.
    """
    argv = ['git', 'show-ref', '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    out = p.stdout.read().strip()
    rv = p.wait()  # not fatal
    # Each output line is '<hex sha> <refname>'.
    for d in out.split('\n'):
        (sha, name) = d.split(' ', 1)
        yield (name, sha.decode('hex'))
def read_ref(refname):
    """Get the commit id of the most recent commit made on a given ref."""
    l = list(list_refs(refname))
def rev_list(ref, count=None):
    """Generate a list of reachable commits in reverse chronological order.

    This generator walks through commits, from child to parent, that are
    reachable via the specified ref and yields a series of tuples of the form
    (date, commit).

    If count is a non-zero integer, limit the number of commits to "count"
    objects.
    """
    assert(not ref.startswith('-'))   # refs must not look like options
    opts += ['-n', str(atoi(count))]
    argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
    p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
    # Output alternates 'commit <hex>' lines with '%ct' date lines.
    for row in p.stdout:
        if s.startswith('commit '):
            commit = s[7:].decode('hex')
        yield (date, commit)
    rv = p.wait()  # not fatal
    raise GitError, 'git rev-list returned error %d' % rv
def rev_get_date(ref):
    """Get the date of the latest commit on the specified ref."""
    # count=1 limits the walk to the tip commit.
    for (date, commit) in rev_list(ref, count=1):
    raise GitError, 'no such commit %r' % ref
def rev_parse(committish):
    """Resolve the full hash for 'committish', if it exists.

    Should be roughly equivalent to 'git rev-parse'.

    Returns the hex value of the hash if it is found, None if 'committish' does
    not correspond to anything.
    """
    # Try it as a ref name first.
    head = read_ref(committish)
    debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
    pL = PackIdxList(repo('objects/pack'))
    # A 40-char string may be a full hex sha; check the pack indexes.
    if len(committish) == 40:
        hash = committish.decode('hex')
def update_ref(refname, newval, oldval):
    """Change the commit pointed to by a branch."""
    # Only branch heads may be updated through this helper.
    assert(refname.startswith('refs/heads/'))
    p = subprocess.Popen(['git', 'update-ref', refname,
                          newval.encode('hex'), oldval.encode('hex')],
                         preexec_fn = _gitenv)
    _git_wait('git update-ref', p)
def guess_repo(path=None):
    """Set the path value in the global variable "repodir".
    This makes bup look for an existing bup repository, but not fail if a
    repository doesn't exist. Usually, if you are interacting with a bup
    repository, you would not be calling this function but using
    check_repo_or_die().
    """
    # Fall back to $BUP_DIR, then to ~/.bup.
    repodir = os.environ.get('BUP_DIR')
    repodir = os.path.expanduser('~/.bup')
def init_repo(path=None):
    """Create the Git bare repository for bup in a given path.

    Raises GitError if the parent directory is missing or the target
    exists and is not a directory.
    """
    guess_repo(path)
    d = repo()  # appends a / to the path
    parent = os.path.dirname(os.path.dirname(d))
    if parent and not os.path.exists(parent):
        raise GitError('parent directory "%s" does not exist\n' % parent)
    if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
        # Fixed: was '%d', which raises TypeError when formatting a path.
        raise GitError('"%s" exists but is not a directory\n' % d)
    p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
                         preexec_fn = _gitenv)
    _git_wait('git init', p)
    # Force the index version configuration in order to ensure bup works
    # regardless of the version of the installed Git binary.
    p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
                         stdout=sys.stderr, preexec_fn = _gitenv)
    _git_wait('git config', p)
def check_repo_or_die(path=None):
    """Make sure a bup repository exists, and abort if not.
    If the path to a particular repository was not specified, this function
    initializes the default repository automatically.
    """
    # objects/pack/. exists in any usable (bare) git repository.
    if not os.path.isdir(repo('objects/pack/.')):
        if repodir == home_repodir:
            log('error: %r is not a bup/git repository\n' % repo())
1128 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1130 while ofs < len(buf):
1131 z = buf[ofs:].find('\0')
1133 spl = buf[ofs:ofs+z].split(' ', 1)
1134 assert(len(spl) == 2)
1135 sha = buf[ofs+z+1:ofs+z+1+20]
1137 yield (spl[0], spl[1], sha)
1142 """Get Git's version and ensure a usable version is installed.
1144 The returned version is formatted as an ordered tuple with each position
1145 representing a digit in the version tag. For example, the following tuple
1146 would represent version 1.6.6.9:
1148 ('1', '6', '6', '9')
1152 p = subprocess.Popen(['git', '--version'],
1153 stdout=subprocess.PIPE)
1154 gvs = p.stdout.read()
1155 _git_wait('git --version', p)
1156 m = re.match(r'git version (\S+.\S+)', gvs)
1158 raise GitError('git --version weird output: %r' % gvs)
1159 _ver = tuple(m.group(1).split('.'))
1160 needed = ('1','5', '3', '1')
1162 raise GitError('git version %s or higher is required; you have %s'
1163 % ('.'.join(needed), '.'.join(_ver)))
1167 def _git_wait(cmd, p):
1170 raise GitError('%s returned %d' % (cmd, rv))
def _git_capture(argv):
    """Run 'argv' (with the bup git environment) and return its stdout."""
    p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
    _git_wait(repr(argv), p)
class _AbortableIter:
    """Iterator wrapper that can be aborted mid-stream, running an
    optional 'onabort' callback (used to reset 'git cat-file')."""
    def __init__(self, it, onabort = None):
        self.onabort = onabort

        # (next() body) Delegate to the wrapped iterator.
        return self.it.next()
        except StopIteration, e:

        """Abort iteration and call the abortion callback, if needed."""
1212 """Link to 'git cat-file' that is used to retrieve blob data."""
1215 wanted = ('1','5','6')
1218 log('warning: git version < %s; bup will be slow.\n'
1221 self.get = self._slow_get
1223 self.p = self.inprogress = None
1224 self.get = self._fast_get
1228 self.p.stdout.close()
1229 self.p.stdin.close()
1231 self.inprogress = None
1235 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1236 stdin=subprocess.PIPE,
1237 stdout=subprocess.PIPE,
1240 preexec_fn = _gitenv)
    def _fast_get(self, id):
        """Fetch object 'id' through the persistent 'git cat-file --batch'
        child; yields the object type, then the content in chunks."""
        if not self.p or self.p.poll() != None:
        assert(self.p.poll() == None)
        log('_fast_get: opening %r while %r is open'
            % (id, self.inprogress))
        assert(not self.inprogress)
        # ids must be single-line and must not look like options.
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(not id.startswith('-'))
        self.inprogress = id
        self.p.stdin.write('%s\n' % id)
        self.p.stdin.flush()
        # Batch header: '<hex sha> <type> <size>' or '<id> missing'.
        hdr = self.p.stdout.readline()
        if hdr.endswith(' missing\n'):
            self.inprogress = None
            raise KeyError('blob %r is missing' % id)
        spl = hdr.split(' ')
        if len(spl) != 3 or len(spl[0]) != 40:
            raise GitError('expected blob, got %r' % spl)
        (hex, type, size) = spl
        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
                            onabort = self._abort)
        # Batch output is followed by a single terminating newline.
        assert(self.p.stdout.readline() == '\n')
        self.inprogress = None
        except Exception, e:
    def _slow_get(self, id):
        """Fetch object 'id' by spawning one 'git cat-file' per request
        (fallback for old git); yields type, then content chunks."""
        assert(id.find('\n') < 0)
        assert(id.find('\r') < 0)
        assert(id[0] != '-')
        # First ask for the type, then stream the object itself.
        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
        p = subprocess.Popen(['git', 'cat-file', type, id],
                             stdout=subprocess.PIPE,
                             preexec_fn = _gitenv)
        for blob in chunkyreader(p.stdout):
        _git_wait('git cat-file', p)
    def _join(self, it):
        """Yield the data of every blob reachable from the object stream
        'it': blobs yield directly, trees/commits recurse via join()."""
        elif type == 'tree':
            treefile = ''.join(it)
            for (mode, name, sha) in treeparse(treefile):
                for blob in self.join(sha.encode('hex')):
        elif type == 'commit':
            # The first line of a commit is 'tree <hex sha>'.
            treeline = ''.join(it).split('\n')[0]
            assert(treeline.startswith('tree '))
            for blob in self.join(treeline[5:]):
        raise GitError('invalid object type %r: expected blob/tree/commit'
1312 """Generate a list of the content of all blobs that can be reached
1313 from an object. The hash given in 'id' must point to a blob, a tree
1314 or a commit. The content of all blobs that can be seen from trees or
1315 commits will be added to the list.
1318 for d in self._join(self.get(id)):
1320 except StopIteration:
1324 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1326 for (n,c) in list_refs():
1327 if n.startswith('refs/tags/'):
1332 tags[c].append(name) # more than one tag can point at 'c'