lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 4
  10
  11 """Discussion of bloom constants for bup:
  12
  13 There are four basic things to consider when building a bloom filter:
  14 The size, in bits, of the filter
  15 The capacity, in entries, of the filter
  16 The probability of a false positive that is tolerable
  17 The number of bits readily available to use for addresing filter bits
  18
  19 There is one major tunable that is not directly related to the above:
  20 k: the number of bits set in the filter per entry
  21
  22 Here's a wall of numbers showing the relationship between k; the ratio between
  23 the filter size in bits and the entries in the filter; and pfalse_positive:
  24
  25 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  26  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  27  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  51
  52 Here's a table showing available repository size for a given pfalse_positive
  53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  54 filter and 8192bytes per object):
  55
  56 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  60
  61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  62 addressing.
  63
  64 filter size scales linearly with repository size for a given k and pfalse.
  65
  66 Here's a table of filter sizes for a 1 TiB repository:
  67
  68 pfalse| k=3        | k=4        | k=5        | k=6
  69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  72
  73 For bup:
  74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  75 per lookup will be worse than the two required for midx.
  76 * We want the pfalse_positive to be low enough that the cost of sometimes
  77 faulting on the midx doesn't overcome the benefit of the bloom filter.
  78 * We have readily available 160 bits for addressing the filter.
  79 * We want to be able to have a single bloom address entire repositories of
  80 reasonable size.
  81
  82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  83 that bup needs.  As such, I've implemented bloom addressing, adding and
  84 checking functions in C for these two values.  Because k=5 requires less space
  85 and gives better overall pfalse_positive perofrmance, it is preferred if a
  86 table with k=5 can represent the repository.
  87
  88 None of this tells us what max_pfalse_positive to choose.
  89
  90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  91 """
  92 BLOOM_VERSION = 2
  93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  96
  97 verbose = 0
  98 ignore_midx = 0
  99 home_repodir = os.path.expanduser('~/.bup')
 100 repodir = None
 101
 102 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 104
 105 _total_searches = 0
 106 _total_steps = 0
 107
 108
 109 class GitError(Exception):
 110     pass
 111
 112
 113 def repo(sub = ''):
 114     """Get the path to the git repository or one of its subdirectories."""
 115     global repodir
 116     if not repodir:
 117         raise GitError('You should call check_repo_or_die()')
 118
 119     # If there's a .git subdirectory, then the actual repo is in there.
 120     gd = os.path.join(repodir, '.git')
 121     if os.path.exists(gd):
 122         repodir = gd
 123
 124     return os.path.join(repodir, sub)
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from semgmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be assembled
 172
 173     For more information on the name mangling algorythm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def _encode_packobj(type, content):
 184     szout = ''
 185     sz = len(content)
 186     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 187     sz >>= 4
 188     while 1:
 189         if sz: szbits |= 0x80
 190         szout += chr(szbits)
 191         if not sz:
 192             break
 193         szbits = sz & 0x7f
 194         sz >>= 7
 195     z = zlib.compressobj(1)
 196     yield szout
 197     yield z.compress(content)
 198     yield z.flush()
 199
 200
 201 def _encode_looseobj(type, content):
 202     z = zlib.compressobj(1)
 203     yield z.compress('%s %d\0' % (type, len(content)))
 204     yield z.compress(content)
 205     yield z.flush()
 206
 207
 208 def _decode_looseobj(buf):
 209     assert(buf);
 210     s = zlib.decompress(buf)
 211     i = s.find('\0')
 212     assert(i > 0)
 213     l = s[:i].split(' ')
 214     type = l[0]
 215     sz = int(l[1])
 216     content = s[i+1:]
 217     assert(type in _typemap)
 218     assert(sz == len(content))
 219     return (type, content)
 220
 221
 222 def _decode_packobj(buf):
 223     assert(buf)
 224     c = ord(buf[0])
 225     type = _typermap[(c & 0x70) >> 4]
 226     sz = c & 0x0f
 227     shift = 4
 228     i = 0
 229     while c & 0x80:
 230         i += 1
 231         c = ord(buf[i])
 232         sz |= (c & 0x7f) << shift
 233         shift += 7
 234         if not (c & 0x80):
 235             break
 236     return (type, zlib.decompress(buf[i+1:]))
 237
 238
 239 class PackIdx:
 240     def __init__(self):
 241         assert(0)
 242
 243     def find_offset(self, hash):
 244         """Get the offset of an object inside the index file."""
 245         idx = self._idx_from_hash(hash)
 246         if idx != None:
 247             return self._ofs_from_idx(idx)
 248         return None
 249
 250     def exists(self, hash, want_source=False):
 251         """Return nonempty if the object exists in this index."""
 252         if hash and (self._idx_from_hash(hash) != None):
 253             return want_source and self.name or True
 254         return None
 255
 256     def __len__(self):
 257         return int(self.fanout[255])
 258
 259     def _idx_from_hash(self, hash):
 260         global _total_searches, _total_steps
 261         _total_searches += 1
 262         assert(len(hash) == 20)
 263         b1 = ord(hash[0])
 264         start = self.fanout[b1-1] # range -1..254
 265         end = self.fanout[b1] # range 0..255
 266         want = str(hash)
 267         _total_steps += 1  # lookup table is a step
 268         while start < end:
 269             _total_steps += 1
 270             mid = start + (end-start)/2
 271             v = self._idx_to_hash(mid)
 272             if v < want:
 273                 start = mid+1
 274             elif v > want:
 275                 end = mid
 276             else: # got it!
 277                 return mid
 278         return None
 279
 280     def iter_with_idx_i(self, idx_i):
 281         for e in self:
 282             yield e, idx_i
 283
 284
 285 class PackIdxV1(PackIdx):
 286     """Object representation of a Git pack index (version 1) file."""
 287     def __init__(self, filename, f):
 288         self.name = filename
 289         self.idxnames = [self.name]
 290         self.map = mmap_read(f)
 291         self.fanout = list(struct.unpack('!256I',
 292                                          str(buffer(self.map, 0, 256*4))))
 293         self.fanout.append(0)  # entry "-1"
 294         nsha = self.fanout[255]
 295         self.shatable = buffer(self.map, 256*4, nsha*24)
 296
 297     def _ofs_from_idx(self, idx):
 298         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 299
 300     def _idx_to_hash(self, idx):
 301         return str(self.shatable[idx*24+4 : idx*24+24])
 302
 303     def __iter__(self):
 304         for i in xrange(self.fanout[255]):
 305             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 306
 307
 308 class PackIdxV2(PackIdx):
 309     """Object representation of a Git pack index (version 2) file."""
 310     def __init__(self, filename, f):
 311         self.name = filename
 312         self.idxnames = [self.name]
 313         self.map = mmap_read(f)
 314         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 315         self.fanout = list(struct.unpack('!256I',
 316                                          str(buffer(self.map, 8, 256*4))))
 317         self.fanout.append(0)  # entry "-1"
 318         nsha = self.fanout[255]
 319         self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
 320         self.ofstable = buffer(self.map,
 321                                8 + 256*4 + nsha*20 + nsha*4,
 322                                nsha*4)
 323         self.ofs64table = buffer(self.map,
 324                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 325
 326     def _ofs_from_idx(self, idx):
 327         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 328         if ofs & 0x80000000:
 329             idx64 = ofs & 0x7fffffff
 330             ofs = struct.unpack('!Q',
 331                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 332         return ofs
 333
 334     def _idx_to_hash(self, idx):
 335         return str(self.shatable[idx*20:(idx+1)*20])
 336
 337     def __iter__(self):
 338         for i in xrange(self.fanout[255]):
 339             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 340
 341
 342 extract_bits = _helpers.extract_bits
 343
 344 bloom_contains = _helpers.bloom_contains
 345 bloom_add = _helpers.bloom_add
 346
 347
 348 class ShaBloom:
 349     """Wrapper which contains data from multiple index files.
 350     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 351     and make it possible for bup to expand Git's indexing capabilities to vast
 352     amounts of files.
 353     """
 354     def __init__(self, filename, f=None, readwrite=False):
 355         self.name = filename
 356         assert(filename.endswith('.bloom'))
 357         if readwrite:
 358             self.rwfile = f or open(filename, 'r+b')
 359             self.map = mmap_readwrite(self.rwfile, close=False)
 360         else:
 361             self.rwfile = None
 362             self.map = mmap_read(f or open(filename, 'rb'))
 363         if str(self.map[0:4]) != 'BLOM':
 364             log('Warning: skipping: invalid BLOM header in %r\n' % filename)
 365             return self._init_failed()
 366         ver = struct.unpack('!I', self.map[4:8])[0]
 367         if ver < BLOOM_VERSION:
 368             log('Warning: ignoring old-style (v%d) bloom %r\n'
 369                 % (ver, filename))
 370             return self._init_failed()
 371         if ver > BLOOM_VERSION:
 372             log('Warning: ignoring too-new (v%d) bloom %r\n'
 373                 % (ver, filename))
 374             return self._init_failed()
 375
 376         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 377         idxnamestr = str(self.map[16 + 2**self.bits:])
 378         if idxnamestr:
 379             self.idxnames = idxnamestr.split('\0')
 380         else:
 381             self.idxnames = []
 382
 383     def _init_failed(self):
 384         if self.map:
 385             self.map.close()
 386             self.map = None
 387         if self.rwfile:
 388             self.rwfile.close()
 389             self.rwfile = None
 390         self.idxnames = []
 391         self.bits = self.entries = 0
 392
 393     def valid(self):
 394         return self.map and self.bits
 395
 396     def __del__(self):
 397         self.close()
 398
 399     def close(self):
 400         if self.map:
 401             if self.rwfile:
 402                 debug2("bloom: closing with %d entries\n" % self.entries)
 403                 self.map[12:16] = struct.pack('!I', self.entries)
 404                 self.map.flush()
 405         if self.rwfile:
 406             self.rwfile.seek(16 + 2**self.bits)
 407             if self.idxnames:
 408                 self.rwfile.write('\0'.join(self.idxnames))
 409         self._init_failed()
 410
 411     def pfalse_positive(self, additional=0):
 412         n = self.entries + additional
 413         m = 8*2**self.bits
 414         k = self.k
 415         return 100*(1-math.exp(-k*float(n)/m))**k
 416
 417     def add_idx(self, ix):
 418         """Add the object to the filter, return current pfalse_positive."""
 419         if not self.map: raise Exception, "Cannot add to closed bloom"
 420         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 421         self.idxnames.append(os.path.basename(ix.name))
 422
 423     def exists(self, sha):
 424         """Return nonempty if the object probably exists in the bloom filter."""
 425         global _total_searches, _total_steps
 426         _total_searches += 1
 427         if not self.map: return None
 428         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 429         _total_steps += steps
 430         return found
 431
 432     @classmethod
 433     def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
 434         """Create and return a bloom filter for `expected` entries."""
 435         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 436         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 437         if bits > MAX_BLOOM_BITS[k]:
 438             log('bloom: warning, max bits exceeded, non-optimal\n')
 439             bits = MAX_BLOOM_BITS[k]
 440         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 441         f = f or open(name, 'w+b')
 442         f.write('BLOM')
 443         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 444         assert(f.tell() == 16)
 445         f.write('\0'*2**bits)
 446         f.seek(0)
 447         return cls(name, f=f, readwrite=readwrite)
 448
 449     def __len__(self):
 450         return self.entries
 451
 452
 453 class PackMidx:
 454     """Wrapper which contains data from multiple index files.
 455     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 456     and make it possible for bup to expand Git's indexing capabilities to vast
 457     amounts of files.
 458     """
 459     def __init__(self, filename):
 460         self.name = filename
 461         self.force_keep = False
 462         assert(filename.endswith('.midx'))
 463         self.map = mmap_read(open(filename))
 464         if str(self.map[0:4]) != 'MIDX':
 465             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 466             self.force_keep = True
 467             return self._init_failed()
 468         ver = struct.unpack('!I', self.map[4:8])[0]
 469         if ver < MIDX_VERSION:
 470             log('Warning: ignoring old-style (v%d) midx %r\n'
 471                 % (ver, filename))
 472             self.force_keep = False  # old stuff is boring
 473             return self._init_failed()
 474         if ver > MIDX_VERSION:
 475             log('Warning: ignoring too-new (v%d) midx %r\n'
 476                 % (ver, filename))
 477             self.force_keep = True  # new stuff is exciting
 478             return self._init_failed()
 479
 480         self.bits = _helpers.firstword(self.map[8:12])
 481         self.entries = 2**self.bits
 482         self.fanout = buffer(self.map, 12, self.entries*4)
 483         shaofs = 12 + self.entries*4
 484         self.nsha = nsha = self._fanget(self.entries-1)
 485         self.shatable = buffer(self.map, shaofs, nsha*20)
 486         self.whichlist = buffer(self.map, shaofs + nsha*20, nsha*4)
 487         self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')
 488
 489     def _init_failed(self):
 490         self.bits = 0
 491         self.entries = 1
 492         self.fanout = buffer('\0\0\0\0')
 493         self.shatable = buffer('\0'*20)
 494         self.idxnames = []
 495
 496     def _fanget(self, i):
 497         start = i*4
 498         s = self.fanout[start:start+4]
 499         return _helpers.firstword(s)
 500
 501     def _get(self, i):
 502         return str(self.shatable[i*20:(i+1)*20])
 503
 504     def _get_idx_i(self, i):
 505         return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
 506
 507     def _get_idxname(self, i):
 508         return self.idxnames[self._get_idx_i(i)]
 509
 510     def exists(self, hash, want_source=False):
 511         """Return nonempty if the object exists in the index files."""
 512         global _total_searches, _total_steps
 513         _total_searches += 1
 514         want = str(hash)
 515         el = extract_bits(want, self.bits)
 516         if el:
 517             start = self._fanget(el-1)
 518             startv = el << (32-self.bits)
 519         else:
 520             start = 0
 521             startv = 0
 522         end = self._fanget(el)
 523         endv = (el+1) << (32-self.bits)
 524         _total_steps += 1   # lookup table is a step
 525         hashv = _helpers.firstword(hash)
 526         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 527         while start < end:
 528             _total_steps += 1
 529             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 530             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 531             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 532             v = self._get(mid)
 533             #print '    %08x' % self._num(v)
 534             if v < want:
 535                 start = mid+1
 536                 startv = _helpers.firstword(v)
 537             elif v > want:
 538                 end = mid
 539                 endv = _helpers.firstword(v)
 540             else: # got it!
 541                 return want_source and self._get_idxname(mid) or True
 542         return None
 543
 544     def iter_with_idx_i(self, ofs):
 545         for i in xrange(self._fanget(self.entries-1)):
 546             yield buffer(self.shatable, i*20, 20), ofs+self._get_idx_i(i)
 547
 548     def __iter__(self):
 549         for i in xrange(self._fanget(self.entries-1)):
 550             yield buffer(self.shatable, i*20, 20)
 551
 552     def __len__(self):
 553         return int(self._fanget(self.entries-1))
 554
 555
 556 _mpi_count = 0
 557 class PackIdxList:
 558     def __init__(self, dir):
 559         global _mpi_count
 560         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 561         _mpi_count += 1
 562         self.dir = dir
 563         self.also = set()
 564         self.packs = []
 565         self.do_bloom = False
 566         self.bloom = None
 567         self.refresh()
 568
 569     def __del__(self):
 570         global _mpi_count
 571         _mpi_count -= 1
 572         assert(_mpi_count == 0)
 573
 574     def __iter__(self):
 575         return iter(idxmerge(self.packs))
 576
 577     def __len__(self):
 578         return sum(len(pack) for pack in self.packs)
 579
 580     def exists(self, hash, want_source=False):
 581         """Return nonempty if the object exists in the index files."""
 582         global _total_searches
 583         _total_searches += 1
 584         if hash in self.also:
 585             return True
 586         if self.do_bloom and self.bloom is not None:
 587             _total_searches -= 1  # will be incremented by bloom
 588             if self.bloom.exists(hash):
 589                 self.do_bloom = False
 590             else:
 591                 return None
 592         for i in xrange(len(self.packs)):
 593             p = self.packs[i]
 594             _total_searches -= 1  # will be incremented by sub-pack
 595             ix = p.exists(hash, want_source=want_source)
 596             if ix:
 597                 # reorder so most recently used packs are searched first
 598                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 599                 return ix
 600         self.do_bloom = True
 601         return None
 602
 603     def refresh(self, skip_midx = False):
 604         """Refresh the index list.
 605         This method verifies if .midx files were superseded (e.g. all of its
 606         contents are in another, bigger .midx file) and removes the superseded
 607         files.
 608
 609         If skip_midx is True, all work on .midx files will be skipped and .midx
 610         files will be removed from the list.
 611
 612         The module-global variable 'ignore_midx' can force this function to
 613         always act as if skip_midx was True.
 614         """
 615         self.bloom = None # Always reopen the bloom as it may have been relaced
 616         self.do_bloom = False
 617         skip_midx = skip_midx or ignore_midx
 618         d = dict((p.name, p) for p in self.packs
 619                  if not skip_midx or not isinstance(p, PackMidx))
 620         if os.path.exists(self.dir):
 621             if not skip_midx:
 622                 midxl = []
 623                 for ix in self.packs:
 624                     if isinstance(ix, PackMidx):
 625                         for name in ix.idxnames:
 626                             d[os.path.join(self.dir, name)] = ix
 627                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 628                     if not d.get(full):
 629                         mx = PackMidx(full)
 630                         (mxd, mxf) = os.path.split(mx.name)
 631                         broken = False
 632                         for n in mx.idxnames:
 633                             if not os.path.exists(os.path.join(mxd, n)):
 634                                 log(('warning: index %s missing\n' +
 635                                     '  used by %s\n') % (n, mxf))
 636                                 broken = True
 637                         if broken:
 638                             del mx
 639                             unlink(full)
 640                         else:
 641                             midxl.append(mx)
 642                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 643                 for ix in midxl:
 644                     any_needed = False
 645                     for sub in ix.idxnames:
 646                         found = d.get(os.path.join(self.dir, sub))
 647                         if not found or isinstance(found, PackIdx):
 648                             # doesn't exist, or exists but not in a midx
 649                             any_needed = True
 650                             break
 651                     if any_needed:
 652                         d[ix.name] = ix
 653                         for name in ix.idxnames:
 654                             d[os.path.join(self.dir, name)] = ix
 655                     elif not ix.force_keep:
 656                         debug1('midx: removing redundant: %s\n'
 657                                % os.path.basename(ix.name))
 658                         unlink(ix.name)
 659             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 660                 if not d.get(full):
 661                     try:
 662                         ix = open_idx(full)
 663                     except GitError, e:
 664                         add_error(e)
 665                         continue
 666                     d[full] = ix
 667             bfull = os.path.join(self.dir, 'bup.bloom')
 668             if self.bloom is None and os.path.exists(bfull):
 669                 self.bloom = ShaBloom(bfull)
 670             self.packs = list(set(d.values()))
 671             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 672             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 673                 self.do_bloom = True
 674             else:
 675                 self.bloom = None
 676         debug1('PackIdxList: using %d index%s.\n'
 677             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 678
 679     def add(self, hash):
 680         """Insert an additional object in the list."""
 681         self.also.add(hash)
 682
 683
 684 def calc_hash(type, content):
 685     """Calculate some content's hash in the Git fashion."""
 686     header = '%s %d\0' % (type, len(content))
 687     sum = Sha1(header)
 688     sum.update(content)
 689     return sum.digest()
 690
 691
 692 def _shalist_sort_key(ent):
 693     (mode, name, id) = ent
 694     if stat.S_ISDIR(int(mode, 8)):
 695         return name + '/'
 696     else:
 697         return name
 698
 699
 700 def open_idx(filename):
 701     if filename.endswith('.idx'):
 702         f = open(filename, 'rb')
 703         header = f.read(8)
 704         if header[0:4] == '\377tOc':
 705             version = struct.unpack('!I', header[4:8])[0]
 706             if version == 2:
 707                 return PackIdxV2(filename, f)
 708             else:
 709                 raise GitError('%s: expected idx file version 2, got %d'
 710                                % (filename, version))
 711         elif len(header) == 8 and header[0:4] < '\377tOc':
 712             return PackIdxV1(filename, f)
 713         else:
 714             raise GitError('%s: unrecognized idx file header' % filename)
 715     elif filename.endswith('.midx'):
 716         return PackMidx(filename)
 717     else:
 718         raise GitError('idx filenames must end with .idx or .midx')
 719
 720
 721 def idxmerge(idxlist, final_progress=True, total=None):
 722     """Generate a list of all the objects reachable in a PackIdxList."""
 723     def pfunc(count, total):
 724         progress('Reading indexes: %.2f%% (%d/%d)\r'
 725                  % (count*100.0/total, count, total))
 726     def pfinal(count, total):
 727         if final_progress:
 728             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 729     return merge_iter(idxlist, 10024, pfunc, pfinal, total=total)
 730
 731
 732 def _make_objcache():
 733     return PackIdxList(repo('objects/pack'))
 734
 735 class PackWriter:
 736     """Writes Git objects insid a pack file."""
 737     def __init__(self, objcache_maker=_make_objcache):
 738         self.count = 0
 739         self.outbytes = 0
 740         self.filename = None
 741         self.file = None
 742         self.idx = None
 743         self.objcache_maker = objcache_maker
 744         self.objcache = None
 745
 746     def __del__(self):
 747         self.close()
 748
 749     def _open(self):
 750         if not self.file:
 751             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 752             self.file = os.fdopen(fd, 'w+b')
 753             assert(name.endswith('.pack'))
 754             self.filename = name[:-5]
 755             self.file.write('PACK\0\0\0\2\0\0\0\0')
 756             self.idx = list(list() for i in xrange(256))
 757
 758     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 759     # in this basic version.
 760     def _raw_write(self, datalist, sha):
 761         self._open()
 762         f = self.file
 763         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 764         # the file never has a *partial* blob.  So let's make sure it's
 765         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 766         # to our hashsplit algorithm.)  f.write() does its own buffering,
 767         # but that's okay because we'll flush it in _end().
 768         oneblob = ''.join(datalist)
 769         try:
 770             f.write(oneblob)
 771         except IOError, e:
 772             raise GitError, e, sys.exc_info()[2]
 773         nw = len(oneblob)
 774         crc = zlib.crc32(oneblob) & 0xffffffff
 775         self._update_idx(sha, crc, nw)
 776         self.outbytes += nw
 777         self.count += 1
 778         return nw, crc
 779
 780     def _update_idx(self, sha, crc, size):
 781         assert(sha)
 782         if self.idx:
 783             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 784
 785     def _write(self, sha, type, content):
 786         if verbose:
 787             log('>')
 788         if not sha:
 789             sha = calc_hash(type, content)
 790         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 791         return sha
 792
 793     def breakpoint(self):
 794         """Clear byte and object counts and return the last processed id."""
 795         id = self._end()
 796         self.outbytes = self.count = 0
 797         return id
 798
 799     def _require_objcache(self):
 800         if self.objcache is None and self.objcache_maker:
 801             self.objcache = self.objcache_maker()
 802         if self.objcache is None:
 803             raise GitError(
 804                     "PackWriter not opened or can't check exists w/o objcache")
 805
 806     def exists(self, id, want_source=False):
 807         """Return non-empty if an object is found in the object cache."""
 808         self._require_objcache()
 809         return self.objcache.exists(id, want_source=want_source)
 810
 811     def maybe_write(self, type, content):
 812         """Write an object to the pack file if not present and return its id."""
 813         self._require_objcache()
 814         sha = calc_hash(type, content)
 815         if not self.exists(sha):
 816             self._write(sha, type, content)
 817             self.objcache.add(sha)
 818         return sha
 819
 820     def new_blob(self, blob):
 821         """Create a blob object in the pack with the supplied content."""
 822         return self.maybe_write('blob', blob)
 823
 824     def new_tree(self, shalist):
 825         """Create a tree object in the pack."""
 826         shalist = sorted(shalist, key = _shalist_sort_key)
 827         l = []
 828         for (mode,name,bin) in shalist:
 829             assert(mode)
 830             assert(mode != '0')
 831             assert(mode[0] != '0')
 832             assert(name)
 833             assert(len(bin) == 20)
 834             l.append('%s %s\0%s' % (mode,name,bin))
 835         return self.maybe_write('tree', ''.join(l))
 836
 837     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 838         l = []
 839         if tree: l.append('tree %s' % tree.encode('hex'))
 840         if parent: l.append('parent %s' % parent.encode('hex'))
 841         if author: l.append('author %s %s' % (author, _git_date(adate)))
 842         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 843         l.append('')
 844         l.append(msg)
 845         return self.maybe_write('commit', '\n'.join(l))
 846
 847     def new_commit(self, parent, tree, date, msg):
 848         """Create a commit object in the pack."""
 849         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 850         commit = self._new_commit(tree, parent,
 851                                   userline, date, userline, date,
 852                                   msg)
 853         return commit
 854
 855     def abort(self):
 856         """Remove the pack file from disk."""
 857         f = self.file
 858         if f:
 859             self.idx = None
 860             self.file = None
 861             f.close()
 862             os.unlink(self.filename + '.pack')
 863
 864     def _end(self, run_midx=True):
 865         f = self.file
 866         if not f: return None
 867         self.file = None
 868         self.objcache = None
 869         idx = self.idx
 870         self.idx = None
 871
 872         # update object count
 873         f.seek(8)
 874         cp = struct.pack('!i', self.count)
 875         assert(len(cp) == 4)
 876         f.write(cp)
 877
 878         # calculate the pack sha1sum
 879         f.seek(0)
 880         sum = Sha1()
 881         for b in chunkyreader(f):
 882             sum.update(b)
 883         packbin = sum.digest()
 884         f.write(packbin)
 885         f.close()
 886
 887         idx_f = open(self.filename + '.idx', 'wb')
 888         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 889         idx_f.close()
 890
 891         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 892         if os.path.exists(self.filename + '.map'):
 893             os.unlink(self.filename + '.map')
 894         os.rename(self.filename + '.pack', nameprefix + '.pack')
 895         os.rename(self.filename + '.idx', nameprefix + '.idx')
 896
 897         if run_midx:
 898             auto_midx(repo('objects/pack'))
 899         return nameprefix
 900
 901     def close(self, run_midx=True):
 902         """Close the pack file and move it to its definitive path."""
 903         return self._end(run_midx=run_midx)
 904
 905     def _write_pack_idx_v2(self, file, idx, packbin):
 906         sum = Sha1()
 907
 908         def write(data):
 909             file.write(data)
 910             sum.update(data)
 911
 912         write('\377tOc\0\0\0\2')
 913
 914         n = 0
 915         for part in idx:
 916             n += len(part)
 917             write(struct.pack('!i', n))
 918             part.sort(key=lambda x: x[0])
 919
 920         obj_list_sum = Sha1()
 921         for part in idx:
 922             for entry in part:
 923                 write(entry[0])
 924                 obj_list_sum.update(entry[0])
 925         for part in idx:
 926             for entry in part:
 927                 write(struct.pack('!I', entry[1]))
 928         ofs64_list = []
 929         for part in idx:
 930             for entry in part:
 931                 if entry[2] & 0x80000000:
 932                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 933                     ofs64_list.append(struct.pack('!Q', entry[2]))
 934                 else:
 935                     write(struct.pack('!i', entry[2]))
 936         for ofs64 in ofs64_list:
 937             write(ofs64)
 938
 939         write(packbin)
 940         file.write(sum.digest())
 941         return obj_list_sum.hexdigest()
 942
 943
 944 def _git_date(date):
 945     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 946
 947
 948 def _gitenv():
 949     os.environ['GIT_DIR'] = os.path.abspath(repo())
 950
 951
 952 def list_refs(refname = None):
 953     """Generate a list of tuples in the form (refname,hash).
 954     If a ref name is specified, list only this particular ref.
 955     """
 956     argv = ['git', 'show-ref', '--']
 957     if refname:
 958         argv += [refname]
 959     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 960     out = p.stdout.read().strip()
 961     rv = p.wait()  # not fatal
 962     if rv:
 963         assert(not out)
 964     if out:
 965         for d in out.split('\n'):
 966             (sha, name) = d.split(' ', 1)
 967             yield (name, sha.decode('hex'))
 968
 969
 970 def read_ref(refname):
 971     """Get the commit id of the most recent commit made on a given ref."""
 972     l = list(list_refs(refname))
 973     if l:
 974         assert(len(l) == 1)
 975         return l[0][1]
 976     else:
 977         return None
 978
 979
 980 def rev_list(ref, count=None):
 981     """Generate a list of reachable commits in reverse chronological order.
 982
 983     This generator walks through commits, from child to parent, that are
 984     reachable via the specified ref and yields a series of tuples of the form
 985     (date,hash).
 986
 987     If count is a non-zero integer, limit the number of commits to "count"
 988     objects.
 989     """
 990     assert(not ref.startswith('-'))
 991     opts = []
 992     if count:
 993         opts += ['-n', str(atoi(count))]
 994     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 995     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 996     commit = None
 997     for row in p.stdout:
 998         s = row.strip()
 999         if s.startswith('commit '):
1000             commit = s[7:].decode('hex')
1001         else:
1002             date = int(s)
1003             yield (date, commit)
1004     rv = p.wait()  # not fatal
1005     if rv:
1006         raise GitError, 'git rev-list returned error %d' % rv
1007
1008
1009 def rev_get_date(ref):
1010     """Get the date of the latest commit on the specified ref."""
1011     for (date, commit) in rev_list(ref, count=1):
1012         return date
1013     raise GitError, 'no such commit %r' % ref
1014
1015
1016 def rev_parse(committish):
1017     """Resolve the full hash for 'committish', if it exists.
1018
1019     Should be roughly equivalent to 'git rev-parse'.
1020
1021     Returns the hex value of the hash if it is found, None if 'committish' does
1022     not correspond to anything.
1023     """
1024     head = read_ref(committish)
1025     if head:
1026         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1027         return head
1028
1029     pL = PackIdxList(repo('objects/pack'))
1030
1031     if len(committish) == 40:
1032         try:
1033             hash = committish.decode('hex')
1034         except TypeError:
1035             return None
1036
1037         if pL.exists(hash):
1038             return hash
1039
1040     return None
1041
1042
1043 def update_ref(refname, newval, oldval):
1044     """Change the commit pointed to by a branch."""
1045     if not oldval:
1046         oldval = ''
1047     assert(refname.startswith('refs/heads/'))
1048     p = subprocess.Popen(['git', 'update-ref', refname,
1049                           newval.encode('hex'), oldval.encode('hex')],
1050                          preexec_fn = _gitenv)
1051     _git_wait('git update-ref', p)
1052
1053
1054 def guess_repo(path=None):
1055     """Set the path value in the global variable "repodir".
1056     This makes bup look for an existing bup repository, but not fail if a
1057     repository doesn't exist. Usually, if you are interacting with a bup
1058     repository, you would not be calling this function but using
1059     check_repo_or_die().
1060     """
1061     global repodir
1062     if path:
1063         repodir = path
1064     if not repodir:
1065         repodir = os.environ.get('BUP_DIR')
1066         if not repodir:
1067             repodir = os.path.expanduser('~/.bup')
1068
1069
1070 def init_repo(path=None):
1071     """Create the Git bare repository for bup in a given path."""
1072     guess_repo(path)
1073     d = repo()  # appends a / to the path
1074     parent = os.path.dirname(os.path.dirname(d))
1075     if parent and not os.path.exists(parent):
1076         raise GitError('parent directory "%s" does not exist\n' % parent)
1077     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1078         raise GitError('"%d" exists but is not a directory\n' % d)
1079     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1080                          preexec_fn = _gitenv)
1081     _git_wait('git init', p)
1082     # Force the index version configuration in order to ensure bup works
1083     # regardless of the version of the installed Git binary.
1084     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1085                          stdout=sys.stderr, preexec_fn = _gitenv)
1086     _git_wait('git config', p)
1087
1088
1089 def check_repo_or_die(path=None):
1090     """Make sure a bup repository exists, and abort if not.
1091     If the path to a particular repository was not specified, this function
1092     initializes the default repository automatically.
1093     """
1094     guess_repo(path)
1095     if not os.path.isdir(repo('objects/pack/.')):
1096         if repodir == home_repodir:
1097             init_repo()
1098         else:
1099             log('error: %r is not a bup/git repository\n' % repo())
1100             sys.exit(15)
1101
1102
1103 def treeparse(buf):
1104     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1105     ofs = 0
1106     while ofs < len(buf):
1107         z = buf[ofs:].find('\0')
1108         assert(z > 0)
1109         spl = buf[ofs:ofs+z].split(' ', 1)
1110         assert(len(spl) == 2)
1111         sha = buf[ofs+z+1:ofs+z+1+20]
1112         ofs += z+1+20
1113         yield (spl[0], spl[1], sha)
1114
1115
1116 _ver = None
1117 def ver():
1118     """Get Git's version and ensure a usable version is installed.
1119
1120     The returned version is formatted as an ordered tuple with each position
1121     representing a digit in the version tag. For example, the following tuple
1122     would represent version 1.6.6.9:
1123
1124         ('1', '6', '6', '9')
1125     """
1126     global _ver
1127     if not _ver:
1128         p = subprocess.Popen(['git', '--version'],
1129                              stdout=subprocess.PIPE)
1130         gvs = p.stdout.read()
1131         _git_wait('git --version', p)
1132         m = re.match(r'git version (\S+.\S+)', gvs)
1133         if not m:
1134             raise GitError('git --version weird output: %r' % gvs)
1135         _ver = tuple(m.group(1).split('.'))
1136     needed = ('1','5', '3', '1')
1137     if _ver < needed:
1138         raise GitError('git version %s or higher is required; you have %s'
1139                        % ('.'.join(needed), '.'.join(_ver)))
1140     return _ver
1141
1142
1143 def _git_wait(cmd, p):
1144     rv = p.wait()
1145     if rv != 0:
1146         raise GitError('%s returned %d' % (cmd, rv))
1147
1148
1149 def _git_capture(argv):
1150     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1151     r = p.stdout.read()
1152     _git_wait(repr(argv), p)
1153     return r
1154
1155
1156 class _AbortableIter:
1157     def __init__(self, it, onabort = None):
1158         self.it = it
1159         self.onabort = onabort
1160         self.done = None
1161
1162     def __iter__(self):
1163         return self
1164
1165     def next(self):
1166         try:
1167             return self.it.next()
1168         except StopIteration, e:
1169             self.done = True
1170             raise
1171         except:
1172             self.abort()
1173             raise
1174
1175     def abort(self):
1176         """Abort iteration and call the abortion callback, if needed."""
1177         if not self.done:
1178             self.done = True
1179             if self.onabort:
1180                 self.onabort()
1181
1182     def __del__(self):
1183         self.abort()
1184
1185
1186 _ver_warned = 0
1187 class CatPipe:
1188     """Link to 'git cat-file' that is used to retrieve blob data."""
1189     def __init__(self):
1190         global _ver_warned
1191         wanted = ('1','5','6')
1192         if ver() < wanted:
1193             if not _ver_warned:
1194                 log('warning: git version < %s; bup will be slow.\n'
1195                     % '.'.join(wanted))
1196                 _ver_warned = 1
1197             self.get = self._slow_get
1198         else:
1199             self.p = self.inprogress = None
1200             self.get = self._fast_get
1201
1202     def _abort(self):
1203         if self.p:
1204             self.p.stdout.close()
1205             self.p.stdin.close()
1206         self.p = None
1207         self.inprogress = None
1208
1209     def _restart(self):
1210         self._abort()
1211         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1212                                   stdin=subprocess.PIPE,
1213                                   stdout=subprocess.PIPE,
1214                                   close_fds = True,
1215                                   bufsize = 4096,
1216                                   preexec_fn = _gitenv)
1217
1218     def _fast_get(self, id):
1219         if not self.p or self.p.poll() != None:
1220             self._restart()
1221         assert(self.p)
1222         assert(self.p.poll() == None)
1223         if self.inprogress:
1224             log('_fast_get: opening %r while %r is open'
1225                 % (id, self.inprogress))
1226         assert(not self.inprogress)
1227         assert(id.find('\n') < 0)
1228         assert(id.find('\r') < 0)
1229         assert(not id.startswith('-'))
1230         self.inprogress = id
1231         self.p.stdin.write('%s\n' % id)
1232         self.p.stdin.flush()
1233         hdr = self.p.stdout.readline()
1234         if hdr.endswith(' missing\n'):
1235             self.inprogress = None
1236             raise KeyError('blob %r is missing' % id)
1237         spl = hdr.split(' ')
1238         if len(spl) != 3 or len(spl[0]) != 40:
1239             raise GitError('expected blob, got %r' % spl)
1240         (hex, type, size) = spl
1241
1242         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1243                            onabort = self._abort)
1244         try:
1245             yield type
1246             for blob in it:
1247                 yield blob
1248             assert(self.p.stdout.readline() == '\n')
1249             self.inprogress = None
1250         except Exception, e:
1251             it.abort()
1252             raise
1253
1254     def _slow_get(self, id):
1255         assert(id.find('\n') < 0)
1256         assert(id.find('\r') < 0)
1257         assert(id[0] != '-')
1258         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1259         yield type
1260
1261         p = subprocess.Popen(['git', 'cat-file', type, id],
1262                              stdout=subprocess.PIPE,
1263                              preexec_fn = _gitenv)
1264         for blob in chunkyreader(p.stdout):
1265             yield blob
1266         _git_wait('git cat-file', p)
1267
1268     def _join(self, it):
1269         type = it.next()
1270         if type == 'blob':
1271             for blob in it:
1272                 yield blob
1273         elif type == 'tree':
1274             treefile = ''.join(it)
1275             for (mode, name, sha) in treeparse(treefile):
1276                 for blob in self.join(sha.encode('hex')):
1277                     yield blob
1278         elif type == 'commit':
1279             treeline = ''.join(it).split('\n')[0]
1280             assert(treeline.startswith('tree '))
1281             for blob in self.join(treeline[5:]):
1282                 yield blob
1283         else:
1284             raise GitError('invalid object type %r: expected blob/tree/commit'
1285                            % type)
1286
1287     def join(self, id):
1288         """Generate a list of the content of all blobs that can be reached
1289         from an object.  The hash given in 'id' must point to a blob, a tree
1290         or a commit. The content of all blobs that can be seen from trees or
1291         commits will be added to the list.
1292         """
1293         try:
1294             for d in self._join(self.get(id)):
1295                 yield d
1296         except StopIteration:
1297             log('booger!\n')
1298
1299 def tags():
1300     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1301     tags = {}
1302     for (n,c) in list_refs():
1303         if n.startswith('refs/tags/'):
1304             name = n[10:]
1305             if not c in tags:
1306                 tags[c] = []
1307
1308             tags[c].append(name)  # more than one tag can point at 'c'
1309
1310     return tags