1 """Git interaction library.
2 bup repositories are in Git format. This library allows us to
3 interact with the Git data structures.
5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
6 from bup.helpers import *
7 from bup import _helpers, path
11 """Discussion of bloom constants for bup:
13 There are four basic things to consider when building a bloom filter:
14 The size, in bits, of the filter
15 The capacity, in entries, of the filter
16 The probability of a false positive that is tolerable
17 The number of bits readily available to use for addressing filter bits
19 There is one major tunable that is not directly related to the above:
20 k: the number of bits set in the filter per entry
22 Here's a wall of numbers showing the relationship between k; the ratio between
23 the filter size in bits and the entries in the filter; and pfalse_positive:
25 mn|k=3 |k=4 |k=5 |k=6 |k=7 |k=8 |k=9 |k=10 |k=11
26 8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
27 9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
52 Here's a table showing available repository size for a given pfalse_positive
53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
54 filter and 8192bytes per object):
56 pfalse|obj k=4 |cap k=4 |obj k=5 |cap k=5 |obj k=6 |cap k=6
57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
64 filter size scales linearly with repository size for a given k and pfalse.
66 Here's a table of filter sizes for a 1 TiB repository:
68 pfalse| k=3 | k=4 | k=5 | k=6
69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
75 per lookup will be worse than the two required for midx.
76 * We want the pfalse_positive to be low enough that the cost of sometimes
77 faulting on the midx doesn't overcome the benefit of the bloom filter.
78 * We have readily available 160 bits for addressing the filter.
79 * We want to be able to have a single bloom address entire repositories of
82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
83 that bup needs. As such, I've implemented bloom addressing, adding and
84 checking functions in C for these two values. Because k=5 requires less space
85 and gives better overall pfalse_positive performance, it is preferred if a
86 table with k=5 can represent the repository.
88 None of this tells us what max_pfalse_positive to choose.
90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
# Bloom-filter tuning constants (see the discussion block above) plus the
# git object-type <-> pack-header-nibble maps used by the pack codecs below.
93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
# Default repository location when no explicit path/BUP_DIR is given.
99 home_repodir = os.path.expanduser('~/.bup')
# git object type name -> pack type number, and the inverse mapping.
102 _typemap = { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
109 class GitError(Exception):
114 """Get the path to the git repository or one of its subdirectories."""
117 raise GitError('You should call check_repo_or_die()')
119 # If there's a .git subdirectory, then the actual repo is in there.
120 gd = os.path.join(repodir, '.git')
121 if os.path.exists(gd):
124 return os.path.join(repodir, sub)
# Run 'bup midx --auto' and then 'bup bloom' over objdir as subprocesses.
# Failures are recorded via add_error() instead of being raised.
# NOTE(review): this listing elides lines (original numbering skips); the
# try/except that binds 'e' around each subprocess.call is not visible here.
127 def auto_midx(objdir):
128 args = [path.exe(), 'midx', '--auto', '--dir', objdir]
130 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
132 # make sure 'args' gets printed to help with debugging
133 add_error('%r: exception: %s' % (args, e))
136 add_error('%r: returned %d' % (args, rv))
138 args = [path.exe(), 'bloom', '--dir', objdir]
140 rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
142 # make sure 'args' gets printed to help with debugging
143 add_error('%r: exception: %s' % (args, e))
146 add_error('%r: returned %d' % (args, rv))
149 def mangle_name(name, mode, gitmode):
150 """Mangle a file name to present an abstract name for segmented files.
151 Mangled file names will have the ".bup" extension added to them. If a
152 file's name already ends with ".bup", a ".bupl" extension is added to
153 disambiguate normal files from segmented ones.
# NOTE(review): the return for the chunked branch (regular file stored as a
# non-regular git object) and the final fall-through return are elided.
155 if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
157 elif name.endswith('.bup') or name[:-1].endswith('.bup'):
158 return name + '.bupl'
# Mode tags returned by demangle_name() describing how to read the file back.
163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
164 def demangle_name(name):
165 """Remove name mangling from a file name, if necessary.
167 The return value is a tuple (demangled_filename,mode), where mode is one of
170 * BUP_NORMAL : files that should be read as-is from the repository
171 * BUP_CHUNKED : files that were chunked and need to be assembled
173 For more information on the name mangling algorithm, see mangle_name()
175 if name.endswith('.bupl'):
176 return (name[:-5], BUP_NORMAL)
177 elif name.endswith('.bup'):
178 return (name[:-4], BUP_CHUNKED)
# NOTE(review): the 'else:' introducing the next line is elided in this listing.
180 return (name, BUP_NORMAL)
# Generator: yields a git pack-object header (low type nibble from _typemap,
# size as a 7-bit varint with 0x80 continuation) then zlib-compressed content.
# NOTE(review): the 'sz' setup, header yields, varint loop and the final
# z.flush() are elided from this listing.
183 def _encode_packobj(type, content):
186 szbits = (sz & 0x0f) | (_typemap[type]<<4)
189 if sz: szbits |= 0x80
195 z = zlib.compressobj(1)
197 yield z.compress(content)
# Generator: a zlib-compressed git loose object, i.e. '<type> <len>\0' header
# followed by the raw content.
# NOTE(review): a trailing 'yield z.flush()' appears to be elided — confirm
# against the full source before relying on the compressed stream being
# complete.
201 def _encode_looseobj(type, content):
202 z = zlib.compressobj(1)
203 yield z.compress('%s %d\0' % (type, len(content)))
204 yield z.compress(content)
# Inverse of _encode_looseobj: decompress buf and split it into
# (type, content) from the '<type> <size>\0<content>' layout.
# NOTE(review): the header-parsing lines that bind type/sz/content between
# the decompress and the asserts are elided here.
208 def _decode_looseobj(buf):
210 s = zlib.decompress(buf)
217 assert(type in _typemap)
218 assert(sz == len(content))
219 return (type, content)
# Inverse of _encode_packobj: read the type nibble and the 7-bit varint size
# from the header, then decompress the payload that follows byte i.
# NOTE(review): the bindings of c/i/shift and the varint loop are elided.
222 def _decode_packobj(buf):
225 type = _typermap[(c & 0x70) >> 4]
232 sz |= (c & 0x7f) << shift
236 return (type, zlib.decompress(buf[i+1:]))
243 def find_offset(self, hash):
244 """Get the offset of an object inside the index file."""
245 idx = self._idx_from_hash(hash)
# NOTE(review): a guard between these lines (presumably 'if idx != None:')
# and the not-found return path are elided.
247 return self._ofs_from_idx(idx)
250 def exists(self, hash, want_source=False):
251 """Return nonempty if the object exists in this index."""
# Returns self.name (the idx filename) when want_source is true, else True;
# the not-found return path is elided from this listing.
252 if hash and (self._idx_from_hash(hash) != None):
253 return want_source and self.name or True
257 return int(self.fanout[255])
# Binary-search the sorted sha table inside the fanout bucket selected by
# the hash's first byte; the search loop, comparisons, and return paths are
# elided from this listing.
259 def _idx_from_hash(self, hash):
260 global _total_searches, _total_steps
262 assert(len(hash) == 20)
264 start = self.fanout[b1-1] # range -1..254
265 end = self.fanout[b1] # range 0..255
267 _total_steps += 1 # lookup table is a step
270 mid = start + (end-start)/2
271 v = self._idx_to_hash(mid)
281 class PackIdxV1(PackIdx):
282 """Object representation of a Git pack index (version 1) file."""
283 def __init__(self, filename, f):
285 self.idxnames = [self.name]
286 self.map = mmap_read(f)
# v1 layout: 256-entry big-endian fanout table, then 24-byte rows of
# (4-byte offset, 20-byte sha).
287 self.fanout = list(struct.unpack('!256I',
288 str(buffer(self.map, 0, 256*4))))
289 self.fanout.append(0) # entry "-1"
290 nsha = self.fanout[255]
# NOTE(review): the line assigning self.sha_ofs is elided here.
292 self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
294 def _ofs_from_idx(self, idx):
295 return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
297 def _idx_to_hash(self, idx):
298 return str(self.shatable[idx*24+4 : idx*24+24])
# NOTE(review): the 'def __iter__(self):' header for the next two lines is
# elided in this listing.
301 for i in xrange(self.fanout[255]):
302 yield buffer(self.map, 256*4 + 24*i + 4, 20)
305 class PackIdxV2(PackIdx):
306 """Object representation of a Git pack index (version 2) file."""
307 def __init__(self, filename, f):
309 self.idxnames = [self.name]
310 self.map = mmap_read(f)
# v2 magic '\377tOc' plus version 2, then fanout / shas / crcs / offsets /
# 64-bit offset table.
311 assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
312 self.fanout = list(struct.unpack('!256I',
313 str(buffer(self.map, 8, 256*4))))
314 self.fanout.append(0) # entry "-1"
315 nsha = self.fanout[255]
316 self.sha_ofs = 8 + 256*4
317 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
# ofstable skips past the nsha*4 crc table that follows the sha table.
318 self.ofstable = buffer(self.map,
319 self.sha_ofs + nsha*20 + nsha*4,
321 self.ofs64table = buffer(self.map,
322 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
324 def _ofs_from_idx(self, idx):
325 ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
# NOTE(review): the 'if ofs & 0x80000000:' guard and final 'return ofs' are
# elided; high bit set means the low 31 bits index the 64-bit table.
327 idx64 = ofs & 0x7fffffff
328 ofs = struct.unpack('!Q',
329 str(buffer(self.ofs64table, idx64*8, 8)))[0]
332 def _idx_to_hash(self, idx):
333 return str(self.shatable[idx*20:(idx+1)*20])
# NOTE(review): the 'def __iter__(self):' header for the next two lines is
# elided in this listing.
336 for i in xrange(self.fanout[255]):
337 yield buffer(self.map, 8 + 256*4 + 20*i, 20)
# Re-export the C-accelerated helpers used by the bloom/midx code below.
340 extract_bits = _helpers.extract_bits
342 bloom_contains = _helpers.bloom_contains
343 bloom_add = _helpers.bloom_add
347 """Wrapper which contains data from multiple index files.
# Open an existing .bloom file, read-write (mmap, possibly private/delayed —
# see comment below) or read-only; bad magic/version bails via _init_failed().
# NOTE(review): the if/else structure selecting between the readwrite and
# read-only paths, and the header-check conditional, are elided here.
349 def __init__(self, filename, f=None, readwrite=False, expected=-1):
353 assert(filename.endswith('.bloom'))
356 self.rwfile = f = f or open(filename, 'r+b')
359 # Decide if we want to mmap() the pages as writable ('immediate'
360 # write) or else map them privately for later writing back to
361 # the file ('delayed' write). A bloom table's write access
362 # pattern is such that we dirty almost all the pages after adding
363 # very few entries. But the table is so big that dirtying
364 # *all* the pages often exceeds Linux's default
365 # /proc/sys/vm/dirty_ratio or /proc/sys/vm/dirty_background_ratio,
366 # thus causing it to start flushing the table before we're
367 # finished... even though there's more than enough space to
368 # store the bloom table in RAM.
370 # To work around that behaviour, if we calculate that we'll
371 # probably end up touching the whole table anyway (at least
372 # one bit flipped per memory page), let's use a "private" mmap,
373 # which defeats Linux's ability to flush it to disk. Then we'll
374 # flush it as one big lump during close().
375 pages = os.fstat(f.fileno()).st_size / 4096 * 5 # assume k=5
376 self.delaywrite = expected > pages
377 debug1('bloom: delaywrite=%r\n' % self.delaywrite)
379 self.map = mmap_readwrite_private(self.rwfile, close=False)
381 self.map = mmap_readwrite(self.rwfile, close=False)
384 f = f or open(filename, 'rb')
385 self.map = mmap_read(f)
386 got = str(self.map[0:4])
388 log('Warning: invalid BLOM header (%r) in %r\n' % (got, filename))
389 return self._init_failed()
390 ver = struct.unpack('!I', self.map[4:8])[0]
391 if ver < BLOOM_VERSION:
392 log('Warning: ignoring old-style (v%d) bloom %r\n'
394 return self._init_failed()
395 if ver > BLOOM_VERSION:
396 log('Warning: ignoring too-new (v%d) bloom %r\n'
398 return self._init_failed()
# Header tail: bits (table is 2**bits bytes), hash count k, entry count;
# idx names are stored NUL-separated after the table.
400 self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
401 idxnamestr = str(self.map[16 + 2**self.bits:])
403 self.idxnames = idxnamestr.split('\0')
407 def _init_failed(self):
# Reset state so the filter reads as invalid; other attribute resets in
# this method are elided from this listing.
414 self.bits = self.entries = 0
417 return self.map and self.bits
423 if self.map and self.rwfile:
424 debug2("bloom: closing with %d entries\n" % self.entries)
425 self.map[12:16] = struct.pack('!I', self.entries)
428 self.rwfile.write(self.map)
431 self.rwfile.seek(16 + 2**self.bits)
433 self.rwfile.write('\0'.join(self.idxnames))
436 def pfalse_positive(self, additional=0):
# Standard bloom false-positive estimate (1 - e^(-kn/m))^k as a percentage,
# optionally pretending 'additional' more entries were added.
437 n = self.entries + additional
# NOTE(review): the bindings of k and m (total filter bits) are elided.
440 return 100*(1-math.exp(-k*float(n)/m))**k
442 def add_idx(self, ix):
443 """Add the object to the filter, return current pfalse_positive."""
444 if not self.map: raise Exception, "Cannot add to closed bloom"
# bloom_add processes the idx's whole shatable in one C call and returns
# how many entries it added.
445 self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
446 self.idxnames.append(os.path.basename(ix.name))
448 def exists(self, sha):
449 """Return nonempty if the object probably exists in the bloom filter."""
450 global _total_searches, _total_steps
452 if not self.map: return None
453 found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
454 _total_steps += steps
# NOTE(review): the final 'return found' line is elided from this listing.
# NOTE(review): the @classmethod decorator line is elided in this listing.
458 def create(cls, name, expected, delaywrite=None, f=None, k=None):
459 """Create and return a bloom filter for `expected` entries."""
# Size the table so each expected entry gets MAX_BITS_EACH bits, capped by
# the addressable bits for the chosen k.
460 bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
461 k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
462 if bits > MAX_BLOOM_BITS[k]:
463 log('bloom: warning, max bits exceeded, non-optimal\n')
464 bits = MAX_BLOOM_BITS[k]
465 debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
466 f = f or open(name, 'w+b')
# NOTE(review): the write of the 'BLOM' magic (making the header total 16
# bytes, per the assert) is elided before this struct.pack write.
468 f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
469 assert(f.tell() == 16)
470 # NOTE: On some systems this will not extend+zerofill, but it does on
471 # darwin, linux, bsd and solaris.
472 f.truncate(16+2**bits)
474 if delaywrite != None and not delaywrite:
475 # tell it to expect very few objects, forcing a direct mmap
477 return cls(name, f=f, readwrite=True, expected=expected)
484 """Wrapper which contains data from multiple index files.
485 Multiple index (.midx) files constitute a wrapper around index (.idx) files
486 and make it possible for bup to expand Git's indexing capabilities to vast
# Open and validate a .midx file; on bad magic or version mismatch, bail
# via _init_failed() with force_keep set to control later cleanup.
489 def __init__(self, filename):
491 self.force_keep = False
492 assert(filename.endswith('.midx'))
493 self.map = mmap_read(open(filename))
494 if str(self.map[0:4]) != 'MIDX':
495 log('Warning: skipping: invalid MIDX header in %r\n' % filename)
496 self.force_keep = True
497 return self._init_failed()
498 ver = struct.unpack('!I', self.map[4:8])[0]
499 if ver < MIDX_VERSION:
500 log('Warning: ignoring old-style (v%d) midx %r\n'
502 self.force_keep = False # old stuff is boring
503 return self._init_failed()
504 if ver > MIDX_VERSION:
505 log('Warning: ignoring too-new (v%d) midx %r\n'
507 self.force_keep = True # new stuff is exciting
508 return self._init_failed()
# Layout after the 12-byte header: 2**bits fanout entries, nsha*20 shas,
# nsha*4 which-idx indices, then NUL-separated idx names.
510 self.bits = _helpers.firstword(self.map[8:12])
511 self.entries = 2**self.bits
512 self.fanout = buffer(self.map, 12, self.entries*4)
513 self.sha_ofs = 12 + self.entries*4
514 self.nsha = nsha = self._fanget(self.entries-1)
515 self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
516 self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
517 self.idxname_ofs = self.sha_ofs + 24*nsha
518 self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
520 def _init_failed(self):
# Substitute tiny dummy tables so later accessors don't crash; other
# resets in this method are elided from this listing.
523 self.fanout = buffer('\0\0\0\0')
524 self.shatable = buffer('\0'*20)
527 def _fanget(self, i):
# Read fanout entry i as a big-endian word via the C helper.
# NOTE(review): the binding of 'start' (presumably i*4) is elided.
529 s = self.fanout[start:start+4]
530 return _helpers.firstword(s)
533 return str(self.shatable[i*20:(i+1)*20])
535 def _get_idx_i(self, i):
# Which source .idx (by position in idxnames) entry i came from.
536 return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
538 def _get_idxname(self, i):
# Filename of the source .idx for entry i.
539 return self.idxnames[self._get_idx_i(i)]
541 def exists(self, hash, want_source=False):
542 """Return nonempty if the object exists in the index files."""
543 global _total_searches, _total_steps
# Interpolation search within the fanout bucket picked by the top self.bits
# bits of the hash; the surrounding while loop, the v binding, and the
# comparison branches that narrow start/end are elided from this listing.
546 el = extract_bits(want, self.bits)
548 start = self._fanget(el-1)
549 startv = el << (32-self.bits)
553 end = self._fanget(el)
554 endv = (el+1) << (32-self.bits)
555 _total_steps += 1 # lookup table is a step
556 hashv = _helpers.firstword(hash)
557 #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
560 #print '! %08x %08x %08x %d - %d' % (startv, hashv, endv, start, end)
561 mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
562 #print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
564 #print ' %08x' % self._num(v)
567 startv = _helpers.firstword(v)
570 endv = _helpers.firstword(v)
572 return want_source and self._get_idxname(mid) or True
576 for i in xrange(self._fanget(self.entries-1)):
577 yield buffer(self.shatable, i*20, 20)
580 return int(self._fanget(self.entries-1))
585 def __init__(self, dir):
587 assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
# NOTE(review): the other attribute initializations (dir, packs, bloom,
# also-set, _mpi_count bump) are elided from this listing.
592 self.do_bloom = False
599 assert(_mpi_count == 0)
602 return iter(idxmerge(self.packs))
605 return sum(len(pack) for pack in self.packs)
607 def exists(self, hash, want_source=False):
608 """Return nonempty if the object exists in the index files."""
609 global _total_searches
# Fast paths first: the 'also' set, then the bloom filter (a bloom miss is
# definitive); otherwise fall back to scanning the per-pack indexes with a
# move-to-front heuristic. Several guard/return lines are elided here.
611 if hash in self.also:
613 if self.do_bloom and self.bloom is not None:
614 _total_searches -= 1 # will be incremented by bloom
615 if self.bloom.exists(hash):
616 self.do_bloom = False
619 for i in xrange(len(self.packs)):
621 _total_searches -= 1 # will be incremented by sub-pack
622 ix = p.exists(hash, want_source=want_source)
624 # reorder so most recently used packs are searched first
625 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
630 def refresh(self, skip_midx = False):
631 """Refresh the index list.
632 This method verifies if .midx files were superseded (e.g. all of its
633 contents are in another, bigger .midx file) and removes the superseded
636 If skip_midx is True, all work on .midx files will be skipped and .midx
637 files will be removed from the list.
639 The module-global variable 'ignore_midx' can force this function to
640 always act as if skip_midx was True.
642 self.bloom = None # Always reopen the bloom as it may have been replaced
643 self.do_bloom = False
644 skip_midx = skip_midx or ignore_midx
# Keep already-open packs (except midxes when skipping them) keyed by path.
645 d = dict((p.name, p) for p in self.packs
646 if not skip_midx or not isinstance(p, PackMidx))
647 if os.path.exists(self.dir):
650 for ix in self.packs:
651 if isinstance(ix, PackMidx):
652 for name in ix.idxnames:
653 d[os.path.join(self.dir, name)] = ix
# Open any .midx not seen yet and check its constituent .idx files exist.
# NOTE(review): several lines in this loop (the midxl accumulation, the
# 'broken' flagging, unlink/keep decisions) are elided from this listing.
654 for full in glob.glob(os.path.join(self.dir,'*.midx')):
657 (mxd, mxf) = os.path.split(mx.name)
659 for n in mx.idxnames:
660 if not os.path.exists(os.path.join(mxd, n)):
661 log(('warning: index %s missing\n' +
662 ' used by %s\n') % (n, mxf))
# Prefer bigger midxes; drop midxes fully covered by ones already chosen.
669 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
672 for sub in ix.idxnames:
673 found = d.get(os.path.join(self.dir, sub))
674 if not found or isinstance(found, PackIdx):
675 # doesn't exist, or exists but not in a midx
680 for name in ix.idxnames:
681 d[os.path.join(self.dir, name)] = ix
682 elif not ix.force_keep:
683 debug1('midx: removing redundant: %s\n'
684 % os.path.basename(ix.name))
686 for full in glob.glob(os.path.join(self.dir,'*.idx')):
694 bfull = os.path.join(self.dir, 'bup.bloom')
695 if self.bloom is None and os.path.exists(bfull):
696 self.bloom = ShaBloom(bfull)
697 self.packs = list(set(d.values()))
698 self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
# Only trust the bloom filter if it is valid and covers every object we have.
699 if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
703 debug1('PackIdxList: using %d index%s.\n'
704 % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
707 """Insert an additional object in the list."""
711 def calc_hash(type, content):
712 """Calculate some content's hash in the Git fashion."""
# Git hashes '<type> <len>\0' + content; the hashing and return lines are
# elided from this listing.
713 header = '%s %d\0' % (type, len(content))
# Sort key for tree entries. The octal-mode string is parsed to detect
# directories; the per-branch return values are elided from this listing
# (presumably directory names are adjusted to match git's tree sort order —
# TODO confirm against full source).
719 def _shalist_sort_key(ent):
720 (mode, name, id) = ent
721 if stat.S_ISDIR(int(mode, 8)):
# Open a pack index by extension: v2 .idx (by magic), v1 .idx (no magic, so
# the first 4 bytes sort below '\377tOc'), or a .midx wrapper.
# NOTE(review): the header read and the version==2 check lines are elided.
727 def open_idx(filename):
728 if filename.endswith('.idx'):
729 f = open(filename, 'rb')
731 if header[0:4] == '\377tOc':
732 version = struct.unpack('!I', header[4:8])[0]
734 return PackIdxV2(filename, f)
736 raise GitError('%s: expected idx file version 2, got %d'
737 % (filename, version))
738 elif len(header) == 8 and header[0:4] < '\377tOc':
739 return PackIdxV1(filename, f)
741 raise GitError('%s: unrecognized idx file header' % filename)
742 elif filename.endswith('.midx'):
743 return PackMidx(filename)
745 raise GitError('idx filenames must end with .idx or .midx')
748 def idxmerge(idxlist, final_progress=True):
749 """Generate a list of all the objects reachable in a PackIdxList."""
# Progress callbacks passed to merge_iter; pfinal only logs when
# final_progress is set (its guard line is elided here).
750 def pfunc(count, total):
751 progress('Reading indexes: %.2f%% (%d/%d)\r'
752 % (count*100.0/total, count, total))
753 def pfinal(count, total):
755 log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
756 return merge_iter(idxlist, 10024, pfunc, pfinal)
# Default objcache factory for PackWriter: index of all local packs.
759 def _make_objcache():
760 return PackIdxList(repo('objects/pack'))
763 """Writes Git objects inside a pack file."""
764 def __init__(self, objcache_maker=_make_objcache):
# Factory is called lazily by _require_objcache(); the other attribute
# initializations in this method are elided from this listing.
770 self.objcache_maker = objcache_maker
778 (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
779 self.file = os.fdopen(fd, 'w+b')
780 assert(name.endswith('.pack'))
781 self.filename = name[:-5]
782 self.file.write('PACK\0\0\0\2\0\0\0\0')
783 self.idx = list(list() for i in xrange(256))
785 # the 'sha' parameter is used in client.py's _raw_write(), but not needed
786 # in this basic version.
787 def _raw_write(self, datalist, sha):
790 # in case we get interrupted (eg. KeyboardInterrupt), it's best if
791 # the file never has a *partial* blob. So let's make sure it's
792 # all-or-nothing. (The blob shouldn't be very big anyway, thanks
793 # to our hashsplit algorithm.) f.write() does its own buffering,
794 # but that's okay because we'll flush it in _end().
795 oneblob = ''.join(datalist)
# NOTE(review): the try/except around the write (which binds 'e' and 'nw')
# and the byte/count bookkeeping + return are elided from this listing.
799 raise GitError, e, sys.exc_info()[2]
801 crc = zlib.crc32(oneblob) & 0xffffffff
802 self._update_idx(sha, crc, nw)
807 def _update_idx(self, sha, crc, size):
# Record (sha, crc, pack offset) bucketed by the sha's first byte, matching
# the fanout layout _write_pack_idx_v2 emits.
810 self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
812 def _write(self, sha, type, content):
# NOTE(review): the guard deciding when to (re)compute sha and the final
# return are elided from this listing.
816 sha = calc_hash(type, content)
817 size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
820 def breakpoint(self):
821 """Clear byte and object counts and return the last processed id."""
# NOTE(review): the line capturing the id to return, and the return itself,
# are elided from this listing.
823 self.outbytes = self.count = 0
826 def _require_objcache(self):
# Lazily build the object cache via the configured factory.
827 if self.objcache is None and self.objcache_maker:
828 self.objcache = self.objcache_maker()
829 if self.objcache is None:
# NOTE(review): the 'raise GitError(' opening of this statement is elided
# before the message line below.
831 "PackWriter not opened or can't check exists w/o objcache")
833 def exists(self, id, want_source=False):
834 """Return non-empty if an object is found in the object cache."""
835 self._require_objcache()
836 return self.objcache.exists(id, want_source=want_source)
838 def maybe_write(self, type, content):
839 """Write an object to the pack file if not present and return its id."""
840 self._require_objcache()
841 sha = calc_hash(type, content)
842 if not self.exists(sha):
843 self._write(sha, type, content)
844 self.objcache.add(sha)
# NOTE(review): the final 'return sha' is elided from this listing.
847 def new_blob(self, blob):
848 """Create a blob object in the pack with the supplied content."""
849 return self.maybe_write('blob', blob)
851 def new_tree(self, shalist):
852 """Create a tree object in the pack."""
# Entries are sorted git-style, then serialized as '<mode> <name>\0<sha>'.
# NOTE(review): the 'l = []' initialization and the mode-normalization lines
# inside the loop are elided from this listing.
853 shalist = sorted(shalist, key = _shalist_sort_key)
855 for (mode,name,bin) in shalist:
858 assert(mode[0] != '0')
860 assert(len(bin) == 20)
861 l.append('%s %s\0%s' % (mode,name,bin))
862 return self.maybe_write('tree', ''.join(l))
864 def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
# Assemble the commit object line by line; the 'l = []' initialization and
# the blank-line/msg append are elided from this listing.
866 if tree: l.append('tree %s' % tree.encode('hex'))
867 if parent: l.append('parent %s' % parent.encode('hex'))
868 if author: l.append('author %s %s' % (author, _git_date(adate)))
869 if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
872 return self.maybe_write('commit', '\n'.join(l))
874 def new_commit(self, parent, tree, date, msg):
875 """Create a commit object in the pack."""
# The current user acts as both author and committer with the same date.
876 userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
877 commit = self._new_commit(tree, parent,
878 userline, date, userline, date,
# NOTE(review): the trailing 'msg)' argument line and the return are elided.
883 """Remove the pack file from disk."""
889 os.unlink(self.filename + '.pack')
891 def _end(self, run_midx=True):
# Finalize the pack: patch the object count into the header, append the
# pack's SHA-1 trailer, write the .idx, then rename both files to their
# content-addressed names. Several bookkeeping lines (f binding/seeks,
# sum updates, idx flattening, return value) are elided from this listing.
893 if not f: return None
899 # update object count
901 cp = struct.pack('!i', self.count)
905 # calculate the pack sha1sum
908 for b in chunkyreader(f):
910 packbin = sum.digest()
914 idx_f = open(self.filename + '.idx', 'wb')
915 obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
918 nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
919 if os.path.exists(self.filename + '.map'):
920 os.unlink(self.filename + '.map')
921 os.rename(self.filename + '.pack', nameprefix + '.pack')
922 os.rename(self.filename + '.idx', nameprefix + '.idx')
925 auto_midx(repo('objects/pack'))
928 def close(self, run_midx=True):
929 """Close the pack file and move it to its definitive path."""
930 return self._end(run_midx=run_midx)
932 def _write_pack_idx_v2(self, file, idx, packbin):
# Emit a v2 pack index: magic+version, 256-entry fanout, shas, crcs,
# 32-bit offsets (high bit -> index into trailing 64-bit offset table),
# then the pack checksum and the idx's own SHA-1 trailer. Returns the
# hex SHA-1 over the sorted object list. Several loop-structure and
# checksum lines are elided from this listing.
939 write('\377tOc\0\0\0\2')
944 write(struct.pack('!i', n))
945 part.sort(key=lambda x: x[0])
947 obj_list_sum = Sha1()
951 obj_list_sum.update(entry[0])
954 write(struct.pack('!I', entry[1]))
958 if entry[2] & 0x80000000:
959 write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
960 ofs64_list.append(struct.pack('!Q', entry[2]))
962 write(struct.pack('!i', entry[2]))
963 for ofs64 in ofs64_list:
967 file.write(sum.digest())
968 return obj_list_sum.hexdigest()
972 return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
976 os.environ['GIT_DIR'] = os.path.abspath(repo())
979 def list_refs(refname = None):
980 """Generate a list of tuples in the form (refname,hash).
981 If a ref name is specified, list only this particular ref.
# Shells out to 'git show-ref --'; a nonzero exit (e.g. no refs) is treated
# as an empty listing. The refname argv append and the rv guard are elided.
983 argv = ['git', 'show-ref', '--']
986 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
987 out = p.stdout.read().strip()
988 rv = p.wait() # not fatal
992 for d in out.split('\n'):
993 (sha, name) = d.split(' ', 1)
994 yield (name, sha.decode('hex'))
997 def read_ref(refname):
998 """Get the commit id of the most recent commit made on a given ref."""
999 l = list(list_refs(refname))
# NOTE(review): the lines selecting from 'l' and returning a result (or
# None) are elided from this listing.
1007 def rev_list(ref, count=None):
1008 """Generate a list of reachable commits in reverse chronological order.
1010 This generator walks through commits, from child to parent, that are
1011 reachable via the specified ref and yields a series of tuples of the form
1014 If count is a non-zero integer, limit the number of commits to "count"
# Parses 'git rev-list --pretty=format:%ct' output: 'commit <hex>' lines
# alternate with timestamp lines. The opts initialization, the timestamp
# parsing that binds 'date', and the rv guard structure are elided here.
1017 assert(not ref.startswith('-'))
1020 opts += ['-n', str(atoi(count))]
1021 argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
1022 p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
1024 for row in p.stdout:
1026 if s.startswith('commit '):
1027 commit = s[7:].decode('hex')
1030 yield (date, commit)
1031 rv = p.wait() # not fatal
1033 raise GitError, 'git rev-list returned error %d' % rv
1036 def rev_get_date(ref):
1037 """Get the date of the latest commit on the specified ref."""
# The 'return date' inside the loop is elided; an empty rev_list means the
# ref doesn't resolve to any commit.
1038 for (date, commit) in rev_list(ref, count=1):
1040 raise GitError, 'no such commit %r' % ref
1043 def rev_parse(committish):
1044 """Resolve the full hash for 'committish', if it exists.
1046 Should be roughly equivalent to 'git rev-parse'.
1048 Returns the hex value of the hash if it is found, None if 'committish' does
1049 not correspond to anything.
# First try it as a ref; failing that, treat a 40-char string as a raw hex
# sha and look it up in the local pack indexes. The return statements and
# the existence check against pL are elided from this listing.
1051 head = read_ref(committish)
1053 debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1056 pL = PackIdxList(repo('objects/pack'))
1058 if len(committish) == 40:
1060 hash = committish.decode('hex')
1070 def update_ref(refname, newval, oldval):
1071 """Change the commit pointed to by a branch."""
# Only branch refs are accepted; delegates the compare-and-swap (oldval ->
# newval) to 'git update-ref'.
1074 assert(refname.startswith('refs/heads/'))
1075 p = subprocess.Popen(['git', 'update-ref', refname,
1076 newval.encode('hex'), oldval.encode('hex')],
1077 preexec_fn = _gitenv)
1078 _git_wait('git update-ref', p)
1081 def guess_repo(path=None):
1082 """Set the path value in the global variable "repodir".
1083 This makes bup look for an existing bup repository, but not fail if a
1084 repository doesn't exist. Usually, if you are interacting with a bup
1085 repository, you would not be calling this function but using
1086 check_repo_or_die().
# Resolution order (guards elided in this listing): explicit path argument,
# then $BUP_DIR, then ~/.bup as the last resort.
1092 repodir = os.environ.get('BUP_DIR')
1094 repodir = os.path.expanduser('~/.bup')
1097 def init_repo(path=None):
1098 """Create the Git bare repository for bup in a given path."""
1100 d = repo() # appends a / to the path
1101 parent = os.path.dirname(os.path.dirname(d))
1102 if parent and not os.path.exists(parent):
1103 raise GitError('parent directory "%s" does not exist\n' % parent)
1104 if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
# NOTE(review): bug — '%d' formats the string path 'd'; this raise would
# itself fail with a TypeError. Should be '%s'.
1105 raise GitError('"%d" exists but is not a directory\n' % d)
1106 p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1107 preexec_fn = _gitenv)
1108 _git_wait('git init', p)
1109 # Force the index version configuration in order to ensure bup works
1110 # regardless of the version of the installed Git binary.
1111 p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1112 stdout=sys.stderr, preexec_fn = _gitenv)
1113 _git_wait('git config', p)
1116 def check_repo_or_die(path=None):
1117 """Make sure a bup repository exists, and abort if not.
1118 If the path to a particular repository was not specified, this function
1119 initializes the default repository automatically.
# The guess_repo() call before this check and the exit/init lines inside
# the branches are elided from this listing.
1122 if not os.path.isdir(repo('objects/pack/.')):
1123 if repodir == home_repodir:
1126 log('error: %r is not a bup/git repository\n' % repo())
1131 """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1133 while ofs < len(buf):
1134 z = buf[ofs:].find('\0')
1136 spl = buf[ofs:ofs+z].split(' ', 1)
1137 assert(len(spl) == 2)
1138 sha = buf[ofs+z+1:ofs+z+1+20]
1140 yield (spl[0], spl[1], sha)
1145 """Get Git's version and ensure a usable version is installed.
1147 The returned version is formatted as an ordered tuple with each position
1148 representing a digit in the version tag. For example, the following tuple
1149 would represent version 1.6.6.9:
1151 ('1', '6', '6', '9')
1155 p = subprocess.Popen(['git', '--version'],
1156 stdout=subprocess.PIPE)
1157 gvs = p.stdout.read()
1158 _git_wait('git --version', p)
1159 m = re.match(r'git version (\S+.\S+)', gvs)
1161 raise GitError('git --version weird output: %r' % gvs)
1162 _ver = tuple(m.group(1).split('.'))
1163 needed = ('1','5', '3', '1')
1165 raise GitError('git version %s or higher is required; you have %s'
1166 % ('.'.join(needed), '.'.join(_ver)))
1170 def _git_wait(cmd, p):
# NOTE(review): the 'rv = p.wait()' binding and its guard are elided before
# this raise.
1173 raise GitError('%s returned %d' % (cmd, rv))
1176 def _git_capture(argv):
# Run a git command with the repo environment and capture stdout; the
# stdout read and return are elided from this listing.
1177 p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1179 _git_wait(repr(argv), p)
1183 class _AbortableIter:
# Iterator wrapper that invokes an optional callback when iteration is
# aborted or exhausted (used to keep the cat-file pipe in a sane state).
1184 def __init__(self, it, onabort = None):
1186 self.onabort = onabort
# NOTE(review): the __iter__/next method headers, the try, and the
# StopIteration cleanup body are elided from this listing.
1194 return self.it.next()
1195 except StopIteration, e:
1203 """Abort iteration and call the abortion callback, if needed."""
1215 """Link to 'git cat-file' that is used to retrieve blob data."""
1218 wanted = ('1','5','6')
1221 log('warning: git version < %s; bup will be slow.\n'
1224 self.get = self._slow_get
1226 self.p = self.inprogress = None
1227 self.get = self._fast_get
1231 self.p.stdout.close()
1232 self.p.stdin.close()
1234 self.inprogress = None
1238 self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1239 stdin=subprocess.PIPE,
1240 stdout=subprocess.PIPE,
1243 preexec_fn = _gitenv)
# Fetch one object through the long-running 'git cat-file --batch' pipe.
# Writes the id, reads the '<sha> <type> <size>' header, then yields the
# type and a chunked reader over exactly <size> bytes.
# NOTE(review): the _restart() call, the yields of type/it, the enclosing
# try, and the except-body (abort/re-raise) are elided from this listing.
1245 def _fast_get(self, id):
1246 if not self.p or self.p.poll() != None:
1249 assert(self.p.poll() == None)
1251 log('_fast_get: opening %r while %r is open'
1252 % (id, self.inprogress))
1253 assert(not self.inprogress)
1254 assert(id.find('\n') < 0)
1255 assert(id.find('\r') < 0)
1256 assert(not id.startswith('-'))
1257 self.inprogress = id
1258 self.p.stdin.write('%s\n' % id)
1259 self.p.stdin.flush()
1260 hdr = self.p.stdout.readline()
1261 if hdr.endswith(' missing\n'):
1262 self.inprogress = None
1263 raise KeyError('blob %r is missing' % id)
1264 spl = hdr.split(' ')
1265 if len(spl) != 3 or len(spl[0]) != 40:
1266 raise GitError('expected blob, got %r' % spl)
1267 (hex, type, size) = spl
1269 it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1270 onabort = self._abort)
# After the object body, batch mode emits a terminating newline.
1275 assert(self.p.stdout.readline() == '\n')
1276 self.inprogress = None
1277 except Exception, e:
# Fallback for old git: one 'git cat-file' subprocess per object. Yields
# the object type, then the content in chunks.
# NOTE(review): the 'yield type' line and the 'yield blob' loop body are
# elided from this listing.
1281 def _slow_get(self, id):
1282 assert(id.find('\n') < 0)
1283 assert(id.find('\r') < 0)
1284 assert(id[0] != '-')
1285 type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1288 p = subprocess.Popen(['git', 'cat-file', type, id],
1289 stdout=subprocess.PIPE,
1290 preexec_fn = _gitenv)
1291 for blob in chunkyreader(p.stdout):
1293 _git_wait('git cat-file', p)
# Recursively flatten an object stream into blob contents: blobs pass
# through, trees recurse into each entry, commits recurse into their tree.
# NOTE(review): the initial '(type, size) = ...' unpacking, the blob branch,
# and the yield statements inside each branch are elided from this listing.
1295 def _join(self, it):
1300 elif type == 'tree':
1301 treefile = ''.join(it)
1302 for (mode, name, sha) in treeparse(treefile):
1303 for blob in self.join(sha.encode('hex')):
1305 elif type == 'commit':
1306 treeline = ''.join(it).split('\n')[0]
1307 assert(treeline.startswith('tree '))
1308 for blob in self.join(treeline[5:]):
1311 raise GitError('invalid object type %r: expected blob/tree/commit'
1315 """Generate a list of the content of all blobs that can be reached
1316 from an object. The hash given in 'id' must point to a blob, a tree
1317 or a commit. The content of all blobs that can be seen from trees or
1318 commits will be added to the list.
1321 for d in self._join(self.get(id)):
1323 except StopIteration:
1327 """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1329 for (n,c) in list_refs():
1330 if n.startswith('refs/tags/'):
1335 tags[c].append(name) # more than one tag can point at 'c'