lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 4
  10 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
  11
  12 """Discussion of bloom constants for bup:
  13
  14 There are four basic things to consider when building a bloom filter:
  15 The size, in bits, of the filter
  16 The capacity, in entries, of the filter
  17 The probability of a false positive that is tolerable
  18 The number of bits readily available to use for addresing filter bits
  19
  20 There is one major tunable that is not directly related to the above:
  21 k: the number of bits set in the filter per entry
  22
  23 Here's a wall of numbers showing the relationship between k; the ratio between
  24 the filter size in bits and the entries in the filter; and pfalse_positive:
  25
  26 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  27  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  28  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  29 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  30 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  31 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  32 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  33 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  34 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  35 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  36 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  37 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  38 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  39 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  40 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  41 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  42 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  43 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  44 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  45 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  46 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  47 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  48 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  49 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  50 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  51 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  52
  53 Here's a table showing available repository size for a given pfalse_positive
  54 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  55 filter and 8192bytes per object):
  56
  57 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  58 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  59 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  60 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  61
  62 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  63 addressing.
  64
  65 filter size scales linearly with repository size for a given k and pfalse.
  66
  67 Here's a table of filter sizes for a 1 TiB repository:
  68
  69 pfalse| k=3        | k=4        | k=5        | k=6
  70 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  71 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  72 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  73
  74 For bup:
  75 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  76 per lookup will be worse than the two required for midx.
  77 * We want the pfalse_positive to be low enough that the cost of sometimes
  78 faulting on the midx doesn't overcome the benefit of the bloom filter.
  79 * We have readily available 160 bits for addressing the filter.
  80 * We want to be able to have a single bloom address entire repositories of
  81 reasonable size.
  82
  83 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  84 that bup needs.  As such, I've implemented bloom addressing, adding and
  85 checking functions in C for these two values.  Because k=5 requires less space
  86 and gives better overall pfalse_positive perofrmance, it is preferred if a
  87 table with k=5 can represent the repository.
  88
  89 None of this tells us what max_pfalse_positive to choose.
  90
  91 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  92 """
  93 BLOOM_VERSION = 2
  94 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  95 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  96 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  97
  98 verbose = 0
  99 ignore_midx = 0
 100 home_repodir = os.path.expanduser('~/.bup')
 101 repodir = None
 102
 103 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 104 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 105
 106 _total_searches = 0
 107 _total_steps = 0
 108
 109
 110 class GitError(Exception):
 111     pass
 112
 113
 114 def repo(sub = ''):
 115     """Get the path to the git repository or one of its subdirectories."""
 116     global repodir
 117     if not repodir:
 118         raise GitError('You should call check_repo_or_die()')
 119
 120     # If there's a .git subdirectory, then the actual repo is in there.
 121     gd = os.path.join(repodir, '.git')
 122     if os.path.exists(gd):
 123         repodir = gd
 124
 125     return os.path.join(repodir, sub)
 126
 127
 128 def auto_midx(objdir):
 129     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 130     try:
 131         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 132     except OSError, e:
 133         # make sure 'args' gets printed to help with debugging
 134         add_error('%r: exception: %s' % (args, e))
 135         raise
 136     if rv:
 137         add_error('%r: returned %d' % (args, rv))
 138
 139     args = [path.exe(), 'bloom', '--dir', objdir]
 140     try:
 141         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 142     except OSError, e:
 143         # make sure 'args' gets printed to help with debugging
 144         add_error('%r: exception: %s' % (args, e))
 145         raise
 146     if rv:
 147         add_error('%r: returned %d' % (args, rv))
 148
 149
 150 def mangle_name(name, mode, gitmode):
 151     """Mangle a file name to present an abstract name for segmented files.
 152     Mangled file names will have the ".bup" extension added to them. If a
 153     file's name already ends with ".bup", a ".bupl" extension is added to
 154     disambiguate normal files from semgmented ones.
 155     """
 156     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 157         return name + '.bup'
 158     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 159         return name + '.bupl'
 160     else:
 161         return name
 162
 163
 164 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 165 def demangle_name(name):
 166     """Remove name mangling from a file name, if necessary.
 167
 168     The return value is a tuple (demangled_filename,mode), where mode is one of
 169     the following:
 170
 171     * BUP_NORMAL  : files that should be read as-is from the repository
 172     * BUP_CHUNKED : files that were chunked and need to be assembled
 173
 174     For more information on the name mangling algorythm, see mangle_name()
 175     """
 176     if name.endswith('.bupl'):
 177         return (name[:-5], BUP_NORMAL)
 178     elif name.endswith('.bup'):
 179         return (name[:-4], BUP_CHUNKED)
 180     else:
 181         return (name, BUP_NORMAL)
 182
 183
 184 def _encode_packobj(type, content):
 185     szout = ''
 186     sz = len(content)
 187     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 188     sz >>= 4
 189     while 1:
 190         if sz: szbits |= 0x80
 191         szout += chr(szbits)
 192         if not sz:
 193             break
 194         szbits = sz & 0x7f
 195         sz >>= 7
 196     z = zlib.compressobj(1)
 197     yield szout
 198     yield z.compress(content)
 199     yield z.flush()
 200
 201
 202 def _encode_looseobj(type, content):
 203     z = zlib.compressobj(1)
 204     yield z.compress('%s %d\0' % (type, len(content)))
 205     yield z.compress(content)
 206     yield z.flush()
 207
 208
 209 def _decode_looseobj(buf):
 210     assert(buf);
 211     s = zlib.decompress(buf)
 212     i = s.find('\0')
 213     assert(i > 0)
 214     l = s[:i].split(' ')
 215     type = l[0]
 216     sz = int(l[1])
 217     content = s[i+1:]
 218     assert(type in _typemap)
 219     assert(sz == len(content))
 220     return (type, content)
 221
 222
 223 def _decode_packobj(buf):
 224     assert(buf)
 225     c = ord(buf[0])
 226     type = _typermap[(c & 0x70) >> 4]
 227     sz = c & 0x0f
 228     shift = 4
 229     i = 0
 230     while c & 0x80:
 231         i += 1
 232         c = ord(buf[i])
 233         sz |= (c & 0x7f) << shift
 234         shift += 7
 235         if not (c & 0x80):
 236             break
 237     return (type, zlib.decompress(buf[i+1:]))
 238
 239
 240 class PackIdx:
 241     def __init__(self):
 242         assert(0)
 243
 244     def find_offset(self, hash):
 245         """Get the offset of an object inside the index file."""
 246         idx = self._idx_from_hash(hash)
 247         if idx != None:
 248             return self._ofs_from_idx(idx)
 249         return None
 250
 251     def exists(self, hash, want_source=False):
 252         """Return nonempty if the object exists in this index."""
 253         if hash and (self._idx_from_hash(hash) != None):
 254             return want_source and os.path.basename(self.name) or True
 255         return None
 256
 257     def __len__(self):
 258         return int(self.fanout[255])
 259
 260     def _idx_from_hash(self, hash):
 261         global _total_searches, _total_steps
 262         _total_searches += 1
 263         assert(len(hash) == 20)
 264         b1 = ord(hash[0])
 265         start = self.fanout[b1-1] # range -1..254
 266         end = self.fanout[b1] # range 0..255
 267         want = str(hash)
 268         _total_steps += 1  # lookup table is a step
 269         while start < end:
 270             _total_steps += 1
 271             mid = start + (end-start)/2
 272             v = self._idx_to_hash(mid)
 273             if v < want:
 274                 start = mid+1
 275             elif v > want:
 276                 end = mid
 277             else: # got it!
 278                 return mid
 279         return None
 280
 281
 282 class PackIdxV1(PackIdx):
 283     """Object representation of a Git pack index (version 1) file."""
 284     def __init__(self, filename, f):
 285         self.name = filename
 286         self.idxnames = [self.name]
 287         self.map = mmap_read(f)
 288         self.fanout = list(struct.unpack('!256I',
 289                                          str(buffer(self.map, 0, 256*4))))
 290         self.fanout.append(0)  # entry "-1"
 291         nsha = self.fanout[255]
 292         self.sha_ofs = 256*4
 293         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 294
 295     def _ofs_from_idx(self, idx):
 296         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 297
 298     def _idx_to_hash(self, idx):
 299         return str(self.shatable[idx*24+4 : idx*24+24])
 300
 301     def __iter__(self):
 302         for i in xrange(self.fanout[255]):
 303             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 304
 305
 306 class PackIdxV2(PackIdx):
 307     """Object representation of a Git pack index (version 2) file."""
 308     def __init__(self, filename, f):
 309         self.name = filename
 310         self.idxnames = [self.name]
 311         self.map = mmap_read(f)
 312         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 313         self.fanout = list(struct.unpack('!256I',
 314                                          str(buffer(self.map, 8, 256*4))))
 315         self.fanout.append(0)  # entry "-1"
 316         nsha = self.fanout[255]
 317         self.sha_ofs = 8 + 256*4
 318         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 319         self.ofstable = buffer(self.map,
 320                                self.sha_ofs + nsha*20 + nsha*4,
 321                                nsha*4)
 322         self.ofs64table = buffer(self.map,
 323                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 324
 325     def _ofs_from_idx(self, idx):
 326         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 327         if ofs & 0x80000000:
 328             idx64 = ofs & 0x7fffffff
 329             ofs = struct.unpack('!Q',
 330                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 331         return ofs
 332
 333     def _idx_to_hash(self, idx):
 334         return str(self.shatable[idx*20:(idx+1)*20])
 335
 336     def __iter__(self):
 337         for i in xrange(self.fanout[255]):
 338             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 339
 340
 341 extract_bits = _helpers.extract_bits
 342
 343 bloom_contains = _helpers.bloom_contains
 344 bloom_add = _helpers.bloom_add
 345
 346
 347 class ShaBloom:
 348     """Wrapper which contains data from multiple index files.
 349     """
 350     def __init__(self, filename, f=None, readwrite=False, expected=-1):
 351         self.name = filename
 352         self.rwfile = None
 353         self.map = None
 354         assert(filename.endswith('.bloom'))
 355         if readwrite:
 356             assert(expected > 0)
 357             self.rwfile = f = f or open(filename, 'r+b')
 358             f.seek(0)
 359
 360             # Decide if we want to mmap() the pages as writable ('immediate'
 361             # write) or else map them privately for later writing back to
 362             # the file ('delayed' write).  A bloom table's write access
 363             # pattern is such that we dirty almost all the pages after adding
 364             # very few entries.  But the table is so big that dirtying
 365             # *all* the pages often exceeds Linux's default
 366             # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
 367             # thus causing it to start flushing the table before we're
 368             # finished... even though there's more than enough space to
 369             # store the bloom table in RAM.
 370             #
 371             # To work around that behaviour, if we calculate that we'll
 372             # probably end up touching the whole table anyway (at least
 373             # one bit flipped per memory page), let's use a "private" mmap,
 374             # which defeats Linux's ability to flush it to disk.  Then we'll
 375             # flush it as one big lump during close().
 376             pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
 377             self.delaywrite = expected > pages
 378             debug1('bloom: delaywrite=%r\n' % self.delaywrite)
 379             if self.delaywrite:
 380                 self.map = mmap_readwrite_private(self.rwfile, close=False)
 381             else:
 382                 self.map = mmap_readwrite(self.rwfile, close=False)
 383         else:
 384             self.rwfile = None
 385             f = f or open(filename, 'rb')
 386             self.map = mmap_read(f)
 387         got = str(self.map[0:4])
 388         if got != 'BLOM':
 389             log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
 390             return self._init_failed()
 391         ver = struct.unpack('!I', self.map[4:8])[0]
 392         if ver < BLOOM_VERSION:
 393             log('Warning: ignoring old-style (v%d) bloom %r\n'
 394                 % (ver, filename))
 395             return self._init_failed()
 396         if ver > BLOOM_VERSION:
 397             log('Warning: ignoring too-new (v%d) bloom %r\n'
 398                 % (ver, filename))
 399             return self._init_failed()
 400
 401         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 402         idxnamestr = str(self.map[16 + 2**self.bits:])
 403         if idxnamestr:
 404             self.idxnames = idxnamestr.split('\0')
 405         else:
 406             self.idxnames = []
 407
 408     def _init_failed(self):
 409         if self.map:
 410             self.map = None
 411         if self.rwfile:
 412             self.rwfile.close()
 413             self.rwfile = None
 414         self.idxnames = []
 415         self.bits = self.entries = 0
 416
 417     def valid(self):
 418         return self.map and self.bits
 419
 420     def __del__(self):
 421         self.close()
 422
 423     def close(self):
 424         if self.map and self.rwfile:
 425             debug2("bloom: closing with %d entries\n" % self.entries)
 426             self.map[12:16] = struct.pack('!I', self.entries)
 427             if self.delaywrite:
 428                 self.rwfile.seek(0)
 429                 self.rwfile.write(self.map)
 430             else:
 431                 self.map.flush()
 432             self.rwfile.seek(16 + 2**self.bits)
 433             if self.idxnames:
 434                 self.rwfile.write('\0'.join(self.idxnames))
 435         self._init_failed()
 436
 437     def pfalse_positive(self, additional=0):
 438         n = self.entries + additional
 439         m = 8*2**self.bits
 440         k = self.k
 441         return 100*(1-math.exp(-k*float(n)/m))**k
 442
 443     def add_idx(self, ix):
 444         """Add the object to the filter, return current pfalse_positive."""
 445         if not self.map: raise Exception, "Cannot add to closed bloom"
 446         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 447         self.idxnames.append(os.path.basename(ix.name))
 448
 449     def exists(self, sha):
 450         """Return nonempty if the object probably exists in the bloom filter."""
 451         global _total_searches, _total_steps
 452         _total_searches += 1
 453         if not self.map: return None
 454         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 455         _total_steps += steps
 456         return found
 457
 458     @classmethod
 459     def create(cls, name, expected, delaywrite=None, f=None, k=None):
 460         """Create and return a bloom filter for `expected` entries."""
 461         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 462         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 463         if bits > MAX_BLOOM_BITS[k]:
 464             log('bloom: warning, max bits exceeded, non-optimal\n')
 465             bits = MAX_BLOOM_BITS[k]
 466         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 467         f = f or open(name, 'w+b')
 468         f.write('BLOM')
 469         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 470         assert(f.tell() == 16)
 471         # NOTE: On some systems this will not extend+zerofill, but it does on
 472         # darwin, linux, bsd and solaris.
 473         f.truncate(16+2**bits)
 474         f.seek(0)
 475         if delaywrite != None and not delaywrite:
 476             # tell it to expect very few objects, forcing a direct mmap
 477             expected = 1
 478         return cls(name, f=f, readwrite=True, expected=expected)
 479
 480     def __len__(self):
 481         return int(self.entries)
 482
 483
 484 class PackMidx:
 485     """Wrapper which contains data from multiple index files.
 486     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 487     and make it possible for bup to expand Git's indexing capabilities to vast
 488     amounts of files.
 489     """
 490     def __init__(self, filename):
 491         self.name = filename
 492         self.force_keep = False
 493         assert(filename.endswith('.midx'))
 494         self.map = mmap_read(open(filename))
 495         if str(self.map[0:4]) != 'MIDX':
 496             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 497             self.force_keep = True
 498             return self._init_failed()
 499         ver = struct.unpack('!I', self.map[4:8])[0]
 500         if ver < MIDX_VERSION:
 501             log('Warning: ignoring old-style (v%d) midx %r\n'
 502                 % (ver, filename))
 503             self.force_keep = False  # old stuff is boring
 504             return self._init_failed()
 505         if ver > MIDX_VERSION:
 506             log('Warning: ignoring too-new (v%d) midx %r\n'
 507                 % (ver, filename))
 508             self.force_keep = True  # new stuff is exciting
 509             return self._init_failed()
 510
 511         self.bits = _helpers.firstword(self.map[8:12])
 512         self.entries = 2**self.bits
 513         self.fanout = buffer(self.map, 12, self.entries*4)
 514         self.sha_ofs = 12 + self.entries*4
 515         self.nsha = nsha = self._fanget(self.entries-1)
 516         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 517         self.which_ofs = self.sha_ofs + 20*nsha
 518         self.whichlist = buffer(self.map, self.which_ofs, nsha*4)
 519         self.idxnames = str(self.map[self.which_ofs + 4*nsha:]).split('\0')
 520
 521     def _init_failed(self):
 522         self.bits = 0
 523         self.entries = 1
 524         self.fanout = buffer('\0\0\0\0')
 525         self.shatable = buffer('\0'*20)
 526         self.idxnames = []
 527
 528     def _fanget(self, i):
 529         start = i*4
 530         s = self.fanout[start:start+4]
 531         return _helpers.firstword(s)
 532
 533     def _get(self, i):
 534         return str(self.shatable[i*20:(i+1)*20])
 535
 536     def _get_idx_i(self, i):
 537         return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
 538
 539     def _get_idxname(self, i):
 540         return self.idxnames[self._get_idx_i(i)]
 541
 542     def exists(self, hash, want_source=False):
 543         """Return nonempty if the object exists in the index files."""
 544         global _total_searches, _total_steps
 545         _total_searches += 1
 546         want = str(hash)
 547         el = extract_bits(want, self.bits)
 548         if el:
 549             start = self._fanget(el-1)
 550             startv = el << (32-self.bits)
 551         else:
 552             start = 0
 553             startv = 0
 554         end = self._fanget(el)
 555         endv = (el+1) << (32-self.bits)
 556         _total_steps += 1   # lookup table is a step
 557         hashv = _helpers.firstword(hash)
 558         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 559         while start < end:
 560             _total_steps += 1
 561             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 562             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 563             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 564             v = self._get(mid)
 565             #print '    %08x' % self._num(v)
 566             if v < want:
 567                 start = mid+1
 568                 startv = _helpers.firstword(v)
 569             elif v > want:
 570                 end = mid
 571                 endv = _helpers.firstword(v)
 572             else: # got it!
 573                 return want_source and self._get_idxname(mid) or True
 574         return None
 575
 576     def __iter__(self):
 577         for i in xrange(self._fanget(self.entries-1)):
 578             yield buffer(self.shatable, i*20, 20)
 579
 580     def __len__(self):
 581         return int(self._fanget(self.entries-1))
 582
 583
 584 _mpi_count = 0
 585 class PackIdxList:
 586     def __init__(self, dir):
 587         global _mpi_count
 588         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 589         _mpi_count += 1
 590         self.dir = dir
 591         self.also = set()
 592         self.packs = []
 593         self.do_bloom = False
 594         self.bloom = None
 595         self.refresh()
 596
 597     def __del__(self):
 598         global _mpi_count
 599         _mpi_count -= 1
 600         assert(_mpi_count == 0)
 601
 602     def __iter__(self):
 603         return iter(idxmerge(self.packs))
 604
 605     def __len__(self):
 606         return sum(len(pack) for pack in self.packs)
 607
 608     def exists(self, hash, want_source=False):
 609         """Return nonempty if the object exists in the index files."""
 610         global _total_searches
 611         _total_searches += 1
 612         if hash in self.also:
 613             return True
 614         if self.do_bloom and self.bloom is not None:
 615             _total_searches -= 1  # will be incremented by bloom
 616             if self.bloom.exists(hash):
 617                 self.do_bloom = False
 618             else:
 619                 return None
 620         for i in xrange(len(self.packs)):
 621             p = self.packs[i]
 622             _total_searches -= 1  # will be incremented by sub-pack
 623             ix = p.exists(hash, want_source=want_source)
 624             if ix:
 625                 # reorder so most recently used packs are searched first
 626                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 627                 return ix
 628         self.do_bloom = True
 629         return None
 630
 631     def refresh(self, skip_midx = False):
 632         """Refresh the index list.
 633         This method verifies if .midx files were superseded (e.g. all of its
 634         contents are in another, bigger .midx file) and removes the superseded
 635         files.
 636
 637         If skip_midx is True, all work on .midx files will be skipped and .midx
 638         files will be removed from the list.
 639
 640         The module-global variable 'ignore_midx' can force this function to
 641         always act as if skip_midx was True.
 642         """
 643         self.bloom = None # Always reopen the bloom as it may have been relaced
 644         self.do_bloom = False
 645         skip_midx = skip_midx or ignore_midx
 646         d = dict((p.name, p) for p in self.packs
 647                  if not skip_midx or not isinstance(p, PackMidx))
 648         if os.path.exists(self.dir):
 649             if not skip_midx:
 650                 midxl = []
 651                 for ix in self.packs:
 652                     if isinstance(ix, PackMidx):
 653                         for name in ix.idxnames:
 654                             d[os.path.join(self.dir, name)] = ix
 655                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 656                     if not d.get(full):
 657                         mx = PackMidx(full)
 658                         (mxd, mxf) = os.path.split(mx.name)
 659                         broken = False
 660                         for n in mx.idxnames:
 661                             if not os.path.exists(os.path.join(mxd, n)):
 662                                 log(('warning: index %s missing\n' +
 663                                     '  used by %s\n') % (n, mxf))
 664                                 broken = True
 665                         if broken:
 666                             del mx
 667                             unlink(full)
 668                         else:
 669                             midxl.append(mx)
 670                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 671                 for ix in midxl:
 672                     any_needed = False
 673                     for sub in ix.idxnames:
 674                         found = d.get(os.path.join(self.dir, sub))
 675                         if not found or isinstance(found, PackIdx):
 676                             # doesn't exist, or exists but not in a midx
 677                             any_needed = True
 678                             break
 679                     if any_needed:
 680                         d[ix.name] = ix
 681                         for name in ix.idxnames:
 682                             d[os.path.join(self.dir, name)] = ix
 683                     elif not ix.force_keep:
 684                         debug1('midx: removing redundant: %s\n'
 685                                % os.path.basename(ix.name))
 686                         unlink(ix.name)
 687             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 688                 if not d.get(full):
 689                     try:
 690                         ix = open_idx(full)
 691                     except GitError, e:
 692                         add_error(e)
 693                         continue
 694                     d[full] = ix
 695             bfull = os.path.join(self.dir, 'bup.bloom')
 696             if self.bloom is None and os.path.exists(bfull):
 697                 self.bloom = ShaBloom(bfull)
 698             self.packs = list(set(d.values()))
 699             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 700             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 701                 self.do_bloom = True
 702             else:
 703                 self.bloom = None
 704         debug1('PackIdxList: using %d index%s.\n'
 705             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 706
 707     def add(self, hash):
 708         """Insert an additional object in the list."""
 709         self.also.add(hash)
 710
 711
 712 def calc_hash(type, content):
 713     """Calculate some content's hash in the Git fashion."""
 714     header = '%s %d\0' % (type, len(content))
 715     sum = Sha1(header)
 716     sum.update(content)
 717     return sum.digest()
 718
 719
 720 def _shalist_sort_key(ent):
 721     (mode, name, id) = ent
 722     if stat.S_ISDIR(int(mode, 8)):
 723         return name + '/'
 724     else:
 725         return name
 726
 727
 728 def open_idx(filename):
 729     if filename.endswith('.idx'):
 730         f = open(filename, 'rb')
 731         header = f.read(8)
 732         if header[0:4] == '\377tOc':
 733             version = struct.unpack('!I', header[4:8])[0]
 734             if version == 2:
 735                 return PackIdxV2(filename, f)
 736             else:
 737                 raise GitError('%s: expected idx file version 2, got %d'
 738                                % (filename, version))
 739         elif len(header) == 8 and header[0:4] < '\377tOc':
 740             return PackIdxV1(filename, f)
 741         else:
 742             raise GitError('%s: unrecognized idx file header' % filename)
 743     elif filename.endswith('.midx'):
 744         return PackMidx(filename)
 745     else:
 746         raise GitError('idx filenames must end with .idx or .midx')
 747
 748
 749 def idxmerge(idxlist, final_progress=True):
 750     """Generate a list of all the objects reachable in a PackIdxList."""
 751     def pfunc(count, total):
 752         progress('Reading indexes: %.2f%% (%d/%d)\r'
 753                  % (count*100.0/total, count, total))
 754     def pfinal(count, total):
 755         if final_progress:
 756             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 757     return merge_iter(idxlist, 10024, pfunc, pfinal)
 758
 759
 760 def _make_objcache():
 761     return PackIdxList(repo('objects/pack'))
 762
 763 class PackWriter:
 764     """Writes Git objects insid a pack file."""
 765     def __init__(self, objcache_maker=_make_objcache):
 766         self.count = 0
 767         self.outbytes = 0
 768         self.filename = None
 769         self.file = None
 770         self.idx = None
 771         self.objcache_maker = objcache_maker
 772         self.objcache = None
 773
 774     def __del__(self):
 775         self.close()
 776
 777     def _open(self):
 778         if not self.file:
 779             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 780             self.file = os.fdopen(fd, 'w+b')
 781             assert(name.endswith('.pack'))
 782             self.filename = name[:-5]
 783             self.file.write('PACK\0\0\0\2\0\0\0\0')
 784             self.idx = list(list() for i in xrange(256))
 785
 786     def _raw_write(self, datalist, sha):
 787         self._open()
 788         f = self.file
 789         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 790         # the file never has a *partial* blob.  So let's make sure it's
 791         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 792         # to our hashsplit algorithm.)  f.write() does its own buffering,
 793         # but that's okay because we'll flush it in _end().
 794         oneblob = ''.join(datalist)
 795         try:
 796             f.write(oneblob)
 797         except IOError, e:
 798             raise GitError, e, sys.exc_info()[2]
 799         nw = len(oneblob)
 800         crc = zlib.crc32(oneblob) & 0xffffffff
 801         self._update_idx(sha, crc, nw)
 802         self.outbytes += nw
 803         self.count += 1
 804         return nw, crc
 805
 806     def _update_idx(self, sha, crc, size):
 807         assert(sha)
 808         if self.idx:
 809             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 810
 811     def _write(self, sha, type, content):
 812         if verbose:
 813             log('>')
 814         if not sha:
 815             sha = calc_hash(type, content)
 816         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 817         return sha
 818
 819     def breakpoint(self):
 820         """Clear byte and object counts and return the last processed id."""
 821         id = self._end()
 822         self.outbytes = self.count = 0
 823         return id
 824
 825     def _require_objcache(self):
 826         if self.objcache is None and self.objcache_maker:
 827             self.objcache = self.objcache_maker()
 828         if self.objcache is None:
 829             raise GitError(
 830                     "PackWriter not opened or can't check exists w/o objcache")
 831
 832     def exists(self, id, want_source=False):
 833         """Return non-empty if an object is found in the object cache."""
 834         self._require_objcache()
 835         return self.objcache.exists(id, want_source=want_source)
 836
 837     def maybe_write(self, type, content):
 838         """Write an object to the pack file if not present and return its id."""
 839         self._require_objcache()
 840         sha = calc_hash(type, content)
 841         if not self.exists(sha):
 842             self._write(sha, type, content)
 843             self.objcache.add(sha)
 844         return sha
 845
 846     def new_blob(self, blob):
 847         """Create a blob object in the pack with the supplied content."""
 848         return self.maybe_write('blob', blob)
 849
 850     def new_tree(self, shalist):
 851         """Create a tree object in the pack."""
 852         shalist = sorted(shalist, key = _shalist_sort_key)
 853         l = []
 854         for (mode,name,bin) in shalist:
 855             assert(mode)
 856             assert(mode != '0')
 857             assert(mode[0] != '0')
 858             assert(name)
 859             assert(len(bin) == 20)
 860             l.append('%s %s\0%s' % (mode,name,bin))
 861         return self.maybe_write('tree', ''.join(l))
 862
 863     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 864         l = []
 865         if tree: l.append('tree %s' % tree.encode('hex'))
 866         if parent: l.append('parent %s' % parent.encode('hex'))
 867         if author: l.append('author %s %s' % (author, _git_date(adate)))
 868         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 869         l.append('')
 870         l.append(msg)
 871         return self.maybe_write('commit', '\n'.join(l))
 872
 873     def new_commit(self, parent, tree, date, msg):
 874         """Create a commit object in the pack."""
 875         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 876         commit = self._new_commit(tree, parent,
 877                                   userline, date, userline, date,
 878                                   msg)
 879         return commit
 880
 881     def abort(self):
 882         """Remove the pack file from disk."""
 883         f = self.file
 884         if f:
 885             self.idx = None
 886             self.file = None
 887             f.close()
 888             os.unlink(self.filename + '.pack')
 889
 890     def _end(self, run_midx=True):
 891         f = self.file
 892         if not f: return None
 893         self.file = None
 894         self.objcache = None
 895         idx = self.idx
 896         self.idx = None
 897
 898         # update object count
 899         f.seek(8)
 900         cp = struct.pack('!i', self.count)
 901         assert(len(cp) == 4)
 902         f.write(cp)
 903
 904         # calculate the pack sha1sum
 905         f.seek(0)
 906         sum = Sha1()
 907         for b in chunkyreader(f):
 908             sum.update(b)
 909         packbin = sum.digest()
 910         f.write(packbin)
 911         f.close()
 912
 913         obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
 914
 915         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 916         if os.path.exists(self.filename + '.map'):
 917             os.unlink(self.filename + '.map')
 918         os.rename(self.filename + '.pack', nameprefix + '.pack')
 919         os.rename(self.filename + '.idx', nameprefix + '.idx')
 920
 921         if run_midx:
 922             auto_midx(repo('objects/pack'))
 923         return nameprefix
 924
 925     def close(self, run_midx=True):
 926         """Close the pack file and move it to its definitive path."""
 927         return self._end(run_midx=run_midx)
 928
 929     def _write_pack_idx_v2(self, filename, idx, packbin):
 930         idx_f = open(filename, 'w+b')
 931         idx_f.write('\377tOc\0\0\0\2')
 932
 933         ofs64_ofs = 8 + 4*256 + 28*self.count
 934         idx_f.truncate(ofs64_ofs)
 935         idx_f.seek(0)
 936         idx_map = mmap_readwrite(idx_f, close=False)
 937         idx_f.seek(0, SEEK_END)
 938         count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
 939         assert(count == self.count)
 940         idx_map.close()
 941         idx_f.write(packbin)
 942
 943         idx_f.seek(0)
 944         idx_sum = Sha1()
 945         b = idx_f.read(8 + 4*256)
 946         idx_sum.update(b)
 947
 948         obj_list_sum = Sha1()
 949         for b in chunkyreader(idx_f, 20*self.count):
 950             idx_sum.update(b)
 951             obj_list_sum.update(b)
 952         namebase = obj_list_sum.hexdigest()
 953
 954         for b in chunkyreader(idx_f):
 955             idx_sum.update(b)
 956         idx_f.write(idx_sum.digest())
 957         idx_f.close()
 958
 959         return namebase
 960
 961
 962 def _git_date(date):
 963     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 964
 965
 966 def _gitenv():
 967     os.environ['GIT_DIR'] = os.path.abspath(repo())
 968
 969
 970 def list_refs(refname = None):
 971     """Generate a list of tuples in the form (refname,hash).
 972     If a ref name is specified, list only this particular ref.
 973     """
 974     argv = ['git', 'show-ref', '--']
 975     if refname:
 976         argv += [refname]
 977     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 978     out = p.stdout.read().strip()
 979     rv = p.wait()  # not fatal
 980     if rv:
 981         assert(not out)
 982     if out:
 983         for d in out.split('\n'):
 984             (sha, name) = d.split(' ', 1)
 985             yield (name, sha.decode('hex'))
 986
 987
 988 def read_ref(refname):
 989     """Get the commit id of the most recent commit made on a given ref."""
 990     l = list(list_refs(refname))
 991     if l:
 992         assert(len(l) == 1)
 993         return l[0][1]
 994     else:
 995         return None
 996
 997
 998 def rev_list(ref, count=None):
 999     """Generate a list of reachable commits in reverse chronological order.
1000
1001     This generator walks through commits, from child to parent, that are
1002     reachable via the specified ref and yields a series of tuples of the form
1003     (date,hash).
1004
1005     If count is a non-zero integer, limit the number of commits to "count"
1006     objects.
1007     """
1008     assert(not ref.startswith('-'))
1009     opts = []
1010     if count:
1011         opts += ['-n', str(atoi(count))]
1012     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1013     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1014     commit = None
1015     for row in p.stdout:
1016         s = row.strip()
1017         if s.startswith('commit '):
1018             commit = s[7:].decode('hex')
1019         else:
1020             date = int(s)
1021             yield (date, commit)
1022     rv = p.wait()  # not fatal
1023     if rv:
1024         raise GitError, 'git rev-list returned error %d' % rv
1025
1026
1027 def rev_get_date(ref):
1028     """Get the date of the latest commit on the specified ref."""
1029     for (date, commit) in rev_list(ref, count=1):
1030         return date
1031     raise GitError, 'no such commit %r' % ref
1032
1033
1034 def rev_parse(committish):
1035     """Resolve the full hash for 'committish', if it exists.
1036
1037     Should be roughly equivalent to 'git rev-parse'.
1038
1039     Returns the hex value of the hash if it is found, None if 'committish' does
1040     not correspond to anything.
1041     """
1042     head = read_ref(committish)
1043     if head:
1044         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1045         return head
1046
1047     pL = PackIdxList(repo('objects/pack'))
1048
1049     if len(committish) == 40:
1050         try:
1051             hash = committish.decode('hex')
1052         except TypeError:
1053             return None
1054
1055         if pL.exists(hash):
1056             return hash
1057
1058     return None
1059
1060
1061 def update_ref(refname, newval, oldval):
1062     """Change the commit pointed to by a branch."""
1063     if not oldval:
1064         oldval = ''
1065     assert(refname.startswith('refs/heads/'))
1066     p = subprocess.Popen(['git', 'update-ref', refname,
1067                           newval.encode('hex'), oldval.encode('hex')],
1068                          preexec_fn = _gitenv)
1069     _git_wait('git update-ref', p)
1070
1071
1072 def guess_repo(path=None):
1073     """Set the path value in the global variable "repodir".
1074     This makes bup look for an existing bup repository, but not fail if a
1075     repository doesn't exist. Usually, if you are interacting with a bup
1076     repository, you would not be calling this function but using
1077     check_repo_or_die().
1078     """
1079     global repodir
1080     if path:
1081         repodir = path
1082     if not repodir:
1083         repodir = os.environ.get('BUP_DIR')
1084         if not repodir:
1085             repodir = os.path.expanduser('~/.bup')
1086
1087
1088 def init_repo(path=None):
1089     """Create the Git bare repository for bup in a given path."""
1090     guess_repo(path)
1091     d = repo()  # appends a / to the path
1092     parent = os.path.dirname(os.path.dirname(d))
1093     if parent and not os.path.exists(parent):
1094         raise GitError('parent directory "%s" does not exist\n' % parent)
1095     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1096         raise GitError('"%d" exists but is not a directory\n' % d)
1097     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1098                          preexec_fn = _gitenv)
1099     _git_wait('git init', p)
1100     # Force the index version configuration in order to ensure bup works
1101     # regardless of the version of the installed Git binary.
1102     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1103                          stdout=sys.stderr, preexec_fn = _gitenv)
1104     _git_wait('git config', p)
1105
1106
1107 def check_repo_or_die(path=None):
1108     """Make sure a bup repository exists, and abort if not.
1109     If the path to a particular repository was not specified, this function
1110     initializes the default repository automatically.
1111     """
1112     guess_repo(path)
1113     if not os.path.isdir(repo('objects/pack/.')):
1114         if repodir == home_repodir:
1115             init_repo()
1116         else:
1117             log('error: %r is not a bup/git repository\n' % repo())
1118             sys.exit(15)
1119
1120
1121 def treeparse(buf):
1122     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1123     ofs = 0
1124     while ofs < len(buf):
1125         z = buf[ofs:].find('\0')
1126         assert(z > 0)
1127         spl = buf[ofs:ofs+z].split(' ', 1)
1128         assert(len(spl) == 2)
1129         sha = buf[ofs+z+1:ofs+z+1+20]
1130         ofs += z+1+20
1131         yield (spl[0], spl[1], sha)
1132
1133
1134 _ver = None
1135 def ver():
1136     """Get Git's version and ensure a usable version is installed.
1137
1138     The returned version is formatted as an ordered tuple with each position
1139     representing a digit in the version tag. For example, the following tuple
1140     would represent version 1.6.6.9:
1141
1142         ('1', '6', '6', '9')
1143     """
1144     global _ver
1145     if not _ver:
1146         p = subprocess.Popen(['git', '--version'],
1147                              stdout=subprocess.PIPE)
1148         gvs = p.stdout.read()
1149         _git_wait('git --version', p)
1150         m = re.match(r'git version (\S+.\S+)', gvs)
1151         if not m:
1152             raise GitError('git --version weird output: %r' % gvs)
1153         _ver = tuple(m.group(1).split('.'))
1154     needed = ('1','5', '3', '1')
1155     if _ver < needed:
1156         raise GitError('git version %s or higher is required; you have %s'
1157                        % ('.'.join(needed), '.'.join(_ver)))
1158     return _ver
1159
1160
1161 def _git_wait(cmd, p):
1162     rv = p.wait()
1163     if rv != 0:
1164         raise GitError('%s returned %d' % (cmd, rv))
1165
1166
1167 def _git_capture(argv):
1168     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1169     r = p.stdout.read()
1170     _git_wait(repr(argv), p)
1171     return r
1172
1173
1174 class _AbortableIter:
1175     def __init__(self, it, onabort = None):
1176         self.it = it
1177         self.onabort = onabort
1178         self.done = None
1179
1180     def __iter__(self):
1181         return self
1182
1183     def next(self):
1184         try:
1185             return self.it.next()
1186         except StopIteration, e:
1187             self.done = True
1188             raise
1189         except:
1190             self.abort()
1191             raise
1192
1193     def abort(self):
1194         """Abort iteration and call the abortion callback, if needed."""
1195         if not self.done:
1196             self.done = True
1197             if self.onabort:
1198                 self.onabort()
1199
1200     def __del__(self):
1201         self.abort()
1202
1203
1204 _ver_warned = 0
1205 class CatPipe:
1206     """Link to 'git cat-file' that is used to retrieve blob data."""
1207     def __init__(self):
1208         global _ver_warned
1209         wanted = ('1','5','6')
1210         if ver() < wanted:
1211             if not _ver_warned:
1212                 log('warning: git version < %s; bup will be slow.\n'
1213                     % '.'.join(wanted))
1214                 _ver_warned = 1
1215             self.get = self._slow_get
1216         else:
1217             self.p = self.inprogress = None
1218             self.get = self._fast_get
1219
1220     def _abort(self):
1221         if self.p:
1222             self.p.stdout.close()
1223             self.p.stdin.close()
1224         self.p = None
1225         self.inprogress = None
1226
1227     def _restart(self):
1228         self._abort()
1229         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1230                                   stdin=subprocess.PIPE,
1231                                   stdout=subprocess.PIPE,
1232                                   close_fds = True,
1233                                   bufsize = 4096,
1234                                   preexec_fn = _gitenv)
1235
1236     def _fast_get(self, id):
1237         if not self.p or self.p.poll() != None:
1238             self._restart()
1239         assert(self.p)
1240         assert(self.p.poll() == None)
1241         if self.inprogress:
1242             log('_fast_get: opening %r while %r is open'
1243                 % (id, self.inprogress))
1244         assert(not self.inprogress)
1245         assert(id.find('\n') < 0)
1246         assert(id.find('\r') < 0)
1247         assert(not id.startswith('-'))
1248         self.inprogress = id
1249         self.p.stdin.write('%s\n' % id)
1250         self.p.stdin.flush()
1251         hdr = self.p.stdout.readline()
1252         if hdr.endswith(' missing\n'):
1253             self.inprogress = None
1254             raise KeyError('blob %r is missing' % id)
1255         spl = hdr.split(' ')
1256         if len(spl) != 3 or len(spl[0]) != 40:
1257             raise GitError('expected blob, got %r' % spl)
1258         (hex, type, size) = spl
1259
1260         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1261                            onabort = self._abort)
1262         try:
1263             yield type
1264             for blob in it:
1265                 yield blob
1266             assert(self.p.stdout.readline() == '\n')
1267             self.inprogress = None
1268         except Exception, e:
1269             it.abort()
1270             raise
1271
1272     def _slow_get(self, id):
1273         assert(id.find('\n') < 0)
1274         assert(id.find('\r') < 0)
1275         assert(id[0] != '-')
1276         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1277         yield type
1278
1279         p = subprocess.Popen(['git', 'cat-file', type, id],
1280                              stdout=subprocess.PIPE,
1281                              preexec_fn = _gitenv)
1282         for blob in chunkyreader(p.stdout):
1283             yield blob
1284         _git_wait('git cat-file', p)
1285
1286     def _join(self, it):
1287         type = it.next()
1288         if type == 'blob':
1289             for blob in it:
1290                 yield blob
1291         elif type == 'tree':
1292             treefile = ''.join(it)
1293             for (mode, name, sha) in treeparse(treefile):
1294                 for blob in self.join(sha.encode('hex')):
1295                     yield blob
1296         elif type == 'commit':
1297             treeline = ''.join(it).split('\n')[0]
1298             assert(treeline.startswith('tree '))
1299             for blob in self.join(treeline[5:]):
1300                 yield blob
1301         else:
1302             raise GitError('invalid object type %r: expected blob/tree/commit'
1303                            % type)
1304
1305     def join(self, id):
1306         """Generate a list of the content of all blobs that can be reached
1307         from an object.  The hash given in 'id' must point to a blob, a tree
1308         or a commit. The content of all blobs that can be seen from trees or
1309         commits will be added to the list.
1310         """
1311         try:
1312             for d in self._join(self.get(id)):
1313                 yield d
1314         except StopIteration:
1315             log('booger!\n')
1316
1317 def tags():
1318     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1319     tags = {}
1320     for (n,c) in list_refs():
1321         if n.startswith('refs/tags/'):
1322             name = n[10:]
1323             if not c in tags:
1324                 tags[c] = []
1325
1326             tags[c].append(name)  # more than one tag can point at 'c'
1327
1328     return tags