lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 4
  10
  11 """Discussion of bloom constants for bup:
  12
  13 There are four basic things to consider when building a bloom filter:
  14 The size, in bits, of the filter
  15 The capacity, in entries, of the filter
  16 The probability of a false positive that is tolerable
  17 The number of bits readily available to use for addresing filter bits
  18
  19 There is one major tunable that is not directly related to the above:
  20 k: the number of bits set in the filter per entry
  21
  22 Here's a wall of numbers showing the relationship between k; the ratio between
  23 the filter size in bits and the entries in the filter; and pfalse_positive:
  24
  25 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  26  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  27  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  51
  52 Here's a table showing available repository size for a given pfalse_positive
  53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  54 filter and 8192bytes per object):
  55
  56 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  60
  61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  62 addressing.
  63
  64 filter size scales linearly with repository size for a given k and pfalse.
  65
  66 Here's a table of filter sizes for a 1 TiB repository:
  67
  68 pfalse| k=3        | k=4        | k=5        | k=6
  69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  72
  73 For bup:
  74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  75 per lookup will be worse than the two required for midx.
  76 * We want the pfalse_positive to be low enough that the cost of sometimes
  77 faulting on the midx doesn't overcome the benefit of the bloom filter.
  78 * We have readily available 160 bits for addressing the filter.
  79 * We want to be able to have a single bloom address entire repositories of
  80 reasonable size.
  81
  82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  83 that bup needs.  As such, I've implemented bloom addressing, adding and
  84 checking functions in C for these two values.  Because k=5 requires less space
  85 and gives better overall pfalse_positive perofrmance, it is preferred if a
  86 table with k=5 can represent the repository.
  87
  88 None of this tells us what max_pfalse_positive to choose.
  89
  90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  91 """
  92 BLOOM_VERSION = 2
  93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  96
  97 verbose = 0
  98 ignore_midx = 0
  99 home_repodir = os.path.expanduser('~/.bup')
 100 repodir = None
 101
 102 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 104
 105 _total_searches = 0
 106 _total_steps = 0
 107
 108
 109 class GitError(Exception):
 110     pass
 111
 112
 113 def repo(sub = ''):
 114     """Get the path to the git repository or one of its subdirectories."""
 115     global repodir
 116     if not repodir:
 117         raise GitError('You should call check_repo_or_die()')
 118
 119     # If there's a .git subdirectory, then the actual repo is in there.
 120     gd = os.path.join(repodir, '.git')
 121     if os.path.exists(gd):
 122         repodir = gd
 123
 124     return os.path.join(repodir, sub)
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from semgmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be assembled
 172
 173     For more information on the name mangling algorythm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def _encode_packobj(type, content):
 184     szout = ''
 185     sz = len(content)
 186     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 187     sz >>= 4
 188     while 1:
 189         if sz: szbits |= 0x80
 190         szout += chr(szbits)
 191         if not sz:
 192             break
 193         szbits = sz & 0x7f
 194         sz >>= 7
 195     z = zlib.compressobj(1)
 196     yield szout
 197     yield z.compress(content)
 198     yield z.flush()
 199
 200
 201 def _encode_looseobj(type, content):
 202     z = zlib.compressobj(1)
 203     yield z.compress('%s %d\0' % (type, len(content)))
 204     yield z.compress(content)
 205     yield z.flush()
 206
 207
 208 def _decode_looseobj(buf):
 209     assert(buf);
 210     s = zlib.decompress(buf)
 211     i = s.find('\0')
 212     assert(i > 0)
 213     l = s[:i].split(' ')
 214     type = l[0]
 215     sz = int(l[1])
 216     content = s[i+1:]
 217     assert(type in _typemap)
 218     assert(sz == len(content))
 219     return (type, content)
 220
 221
 222 def _decode_packobj(buf):
 223     assert(buf)
 224     c = ord(buf[0])
 225     type = _typermap[(c & 0x70) >> 4]
 226     sz = c & 0x0f
 227     shift = 4
 228     i = 0
 229     while c & 0x80:
 230         i += 1
 231         c = ord(buf[i])
 232         sz |= (c & 0x7f) << shift
 233         shift += 7
 234         if not (c & 0x80):
 235             break
 236     return (type, zlib.decompress(buf[i+1:]))
 237
 238
 239 class PackIdx:
 240     def __init__(self):
 241         assert(0)
 242
 243     def find_offset(self, hash):
 244         """Get the offset of an object inside the index file."""
 245         idx = self._idx_from_hash(hash)
 246         if idx != None:
 247             return self._ofs_from_idx(idx)
 248         return None
 249
 250     def exists(self, hash, want_source=False):
 251         """Return nonempty if the object exists in this index."""
 252         if hash and (self._idx_from_hash(hash) != None):
 253             return want_source and self.name or True
 254         return None
 255
 256     def __len__(self):
 257         return int(self.fanout[255])
 258
 259     def _idx_from_hash(self, hash):
 260         global _total_searches, _total_steps
 261         _total_searches += 1
 262         assert(len(hash) == 20)
 263         b1 = ord(hash[0])
 264         start = self.fanout[b1-1] # range -1..254
 265         end = self.fanout[b1] # range 0..255
 266         want = str(hash)
 267         _total_steps += 1  # lookup table is a step
 268         while start < end:
 269             _total_steps += 1
 270             mid = start + (end-start)/2
 271             v = self._idx_to_hash(mid)
 272             if v < want:
 273                 start = mid+1
 274             elif v > want:
 275                 end = mid
 276             else: # got it!
 277                 return mid
 278         return None
 279
 280
 281 class PackIdxV1(PackIdx):
 282     """Object representation of a Git pack index (version 1) file."""
 283     def __init__(self, filename, f):
 284         self.name = filename
 285         self.idxnames = [self.name]
 286         self.map = mmap_read(f)
 287         self.fanout = list(struct.unpack('!256I',
 288                                          str(buffer(self.map, 0, 256*4))))
 289         self.fanout.append(0)  # entry "-1"
 290         nsha = self.fanout[255]
 291         self.sha_ofs = 256*4
 292         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 293
 294     def _ofs_from_idx(self, idx):
 295         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 296
 297     def _idx_to_hash(self, idx):
 298         return str(self.shatable[idx*24+4 : idx*24+24])
 299
 300     def __iter__(self):
 301         for i in xrange(self.fanout[255]):
 302             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 303
 304
 305 class PackIdxV2(PackIdx):
 306     """Object representation of a Git pack index (version 2) file."""
 307     def __init__(self, filename, f):
 308         self.name = filename
 309         self.idxnames = [self.name]
 310         self.map = mmap_read(f)
 311         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 312         self.fanout = list(struct.unpack('!256I',
 313                                          str(buffer(self.map, 8, 256*4))))
 314         self.fanout.append(0)  # entry "-1"
 315         nsha = self.fanout[255]
 316         self.sha_ofs = 8 + 256*4
 317         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 318         self.ofstable = buffer(self.map,
 319                                self.sha_ofs + nsha*20 + nsha*4,
 320                                nsha*4)
 321         self.ofs64table = buffer(self.map,
 322                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 323
 324     def _ofs_from_idx(self, idx):
 325         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 326         if ofs & 0x80000000:
 327             idx64 = ofs & 0x7fffffff
 328             ofs = struct.unpack('!Q',
 329                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 330         return ofs
 331
 332     def _idx_to_hash(self, idx):
 333         return str(self.shatable[idx*20:(idx+1)*20])
 334
 335     def __iter__(self):
 336         for i in xrange(self.fanout[255]):
 337             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 338
 339
 340 extract_bits = _helpers.extract_bits
 341
 342 bloom_contains = _helpers.bloom_contains
 343 bloom_add = _helpers.bloom_add
 344
 345
 346 class ShaBloom:
 347     """Wrapper which contains data from multiple index files.
 348     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 349     and make it possible for bup to expand Git's indexing capabilities to vast
 350     amounts of files.
 351     """
 352     def __init__(self, filename, f=None, readwrite=False):
 353         self.name = filename
 354         self.rwfile = None
 355         self.map = None
 356         assert(filename.endswith('.bloom'))
 357         if readwrite:
 358             self.rwfile = f or open(filename, 'r+b')
 359             self.map = mmap_readwrite(self.rwfile, close=False)
 360         else:
 361             self.rwfile = None
 362             self.map = mmap_read(f or open(filename, 'rb'))
 363         if str(self.map[0:4]) != 'BLOM':
 364             log('Warning: skipping: invalid BLOM header in %r\n' % filename)
 365             return self._init_failed()
 366         ver = struct.unpack('!I', self.map[4:8])[0]
 367         if ver < BLOOM_VERSION:
 368             log('Warning: ignoring old-style (v%d) bloom %r\n'
 369                 % (ver, filename))
 370             return self._init_failed()
 371         if ver > BLOOM_VERSION:
 372             log('Warning: ignoring too-new (v%d) bloom %r\n'
 373                 % (ver, filename))
 374             return self._init_failed()
 375
 376         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 377         idxnamestr = str(self.map[16 + 2**self.bits:])
 378         if idxnamestr:
 379             self.idxnames = idxnamestr.split('\0')
 380         else:
 381             self.idxnames = []
 382
 383     def _init_failed(self):
 384         if self.map:
 385             self.map.close()
 386             self.map = None
 387         if self.rwfile:
 388             self.rwfile.close()
 389             self.rwfile = None
 390         self.idxnames = []
 391         self.bits = self.entries = 0
 392
 393     def valid(self):
 394         return self.map and self.bits
 395
 396     def __del__(self):
 397         self.close()
 398
 399     def close(self):
 400         if self.map:
 401             if self.rwfile:
 402                 debug2("bloom: closing with %d entries\n" % self.entries)
 403                 self.map[12:16] = struct.pack('!I', self.entries)
 404                 self.map.flush()
 405         if self.rwfile:
 406             self.rwfile.seek(16 + 2**self.bits)
 407             if self.idxnames:
 408                 self.rwfile.write('\0'.join(self.idxnames))
 409         self._init_failed()
 410
 411     def pfalse_positive(self, additional=0):
 412         n = self.entries + additional
 413         m = 8*2**self.bits
 414         k = self.k
 415         return 100*(1-math.exp(-k*float(n)/m))**k
 416
 417     def add_idx(self, ix):
 418         """Add the object to the filter, return current pfalse_positive."""
 419         if not self.map: raise Exception, "Cannot add to closed bloom"
 420         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 421         self.idxnames.append(os.path.basename(ix.name))
 422
 423     def exists(self, sha):
 424         """Return nonempty if the object probably exists in the bloom filter."""
 425         global _total_searches, _total_steps
 426         _total_searches += 1
 427         if not self.map: return None
 428         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 429         _total_steps += steps
 430         return found
 431
 432     @classmethod
 433     def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
 434         """Create and return a bloom filter for `expected` entries."""
 435         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 436         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 437         if bits > MAX_BLOOM_BITS[k]:
 438             log('bloom: warning, max bits exceeded, non-optimal\n')
 439             bits = MAX_BLOOM_BITS[k]
 440         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 441         f = f or open(name, 'w+b')
 442         f.write('BLOM')
 443         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 444         assert(f.tell() == 16)
 445         f.write('\0'*2**bits)
 446         f.seek(0)
 447         return cls(name, f=f, readwrite=readwrite)
 448
 449     def __len__(self):
 450         return self.entries
 451
 452
 453 class PackMidx:
 454     """Wrapper which contains data from multiple index files.
 455     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 456     and make it possible for bup to expand Git's indexing capabilities to vast
 457     amounts of files.
 458     """
 459     def __init__(self, filename):
 460         self.name = filename
 461         self.force_keep = False
 462         assert(filename.endswith('.midx'))
 463         self.map = mmap_read(open(filename))
 464         if str(self.map[0:4]) != 'MIDX':
 465             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 466             self.force_keep = True
 467             return self._init_failed()
 468         ver = struct.unpack('!I', self.map[4:8])[0]
 469         if ver < MIDX_VERSION:
 470             log('Warning: ignoring old-style (v%d) midx %r\n'
 471                 % (ver, filename))
 472             self.force_keep = False  # old stuff is boring
 473             return self._init_failed()
 474         if ver > MIDX_VERSION:
 475             log('Warning: ignoring too-new (v%d) midx %r\n'
 476                 % (ver, filename))
 477             self.force_keep = True  # new stuff is exciting
 478             return self._init_failed()
 479
 480         self.bits = _helpers.firstword(self.map[8:12])
 481         self.entries = 2**self.bits
 482         self.fanout = buffer(self.map, 12, self.entries*4)
 483         self.sha_ofs = 12 + self.entries*4
 484         self.nsha = nsha = self._fanget(self.entries-1)
 485         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 486         self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
 487         self.idxname_ofs = self.sha_ofs + 24*nsha
 488         self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
 489
 490     def _init_failed(self):
 491         self.bits = 0
 492         self.entries = 1
 493         self.fanout = buffer('\0\0\0\0')
 494         self.shatable = buffer('\0'*20)
 495         self.idxnames = []
 496
 497     def _fanget(self, i):
 498         start = i*4
 499         s = self.fanout[start:start+4]
 500         return _helpers.firstword(s)
 501
 502     def _get(self, i):
 503         return str(self.shatable[i*20:(i+1)*20])
 504
 505     def _get_idx_i(self, i):
 506         return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
 507
 508     def _get_idxname(self, i):
 509         return self.idxnames[self._get_idx_i(i)]
 510
 511     def exists(self, hash, want_source=False):
 512         """Return nonempty if the object exists in the index files."""
 513         global _total_searches, _total_steps
 514         _total_searches += 1
 515         want = str(hash)
 516         el = extract_bits(want, self.bits)
 517         if el:
 518             start = self._fanget(el-1)
 519             startv = el << (32-self.bits)
 520         else:
 521             start = 0
 522             startv = 0
 523         end = self._fanget(el)
 524         endv = (el+1) << (32-self.bits)
 525         _total_steps += 1   # lookup table is a step
 526         hashv = _helpers.firstword(hash)
 527         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 528         while start < end:
 529             _total_steps += 1
 530             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 531             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 532             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 533             v = self._get(mid)
 534             #print '    %08x' % self._num(v)
 535             if v < want:
 536                 start = mid+1
 537                 startv = _helpers.firstword(v)
 538             elif v > want:
 539                 end = mid
 540                 endv = _helpers.firstword(v)
 541             else: # got it!
 542                 return want_source and self._get_idxname(mid) or True
 543         return None
 544
 545     def __iter__(self):
 546         for i in xrange(self._fanget(self.entries-1)):
 547             yield buffer(self.shatable, i*20, 20)
 548
 549     def __len__(self):
 550         return int(self._fanget(self.entries-1))
 551
 552
 553 _mpi_count = 0
 554 class PackIdxList:
 555     def __init__(self, dir):
 556         global _mpi_count
 557         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 558         _mpi_count += 1
 559         self.dir = dir
 560         self.also = set()
 561         self.packs = []
 562         self.do_bloom = False
 563         self.bloom = None
 564         self.refresh()
 565
 566     def __del__(self):
 567         global _mpi_count
 568         _mpi_count -= 1
 569         assert(_mpi_count == 0)
 570
 571     def __iter__(self):
 572         return iter(idxmerge(self.packs))
 573
 574     def __len__(self):
 575         return sum(len(pack) for pack in self.packs)
 576
 577     def exists(self, hash, want_source=False):
 578         """Return nonempty if the object exists in the index files."""
 579         global _total_searches
 580         _total_searches += 1
 581         if hash in self.also:
 582             return True
 583         if self.do_bloom and self.bloom is not None:
 584             _total_searches -= 1  # will be incremented by bloom
 585             if self.bloom.exists(hash):
 586                 self.do_bloom = False
 587             else:
 588                 return None
 589         for i in xrange(len(self.packs)):
 590             p = self.packs[i]
 591             _total_searches -= 1  # will be incremented by sub-pack
 592             ix = p.exists(hash, want_source=want_source)
 593             if ix:
 594                 # reorder so most recently used packs are searched first
 595                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 596                 return ix
 597         self.do_bloom = True
 598         return None
 599
 600     def refresh(self, skip_midx = False):
 601         """Refresh the index list.
 602         This method verifies if .midx files were superseded (e.g. all of its
 603         contents are in another, bigger .midx file) and removes the superseded
 604         files.
 605
 606         If skip_midx is True, all work on .midx files will be skipped and .midx
 607         files will be removed from the list.
 608
 609         The module-global variable 'ignore_midx' can force this function to
 610         always act as if skip_midx was True.
 611         """
 612         self.bloom = None # Always reopen the bloom as it may have been relaced
 613         self.do_bloom = False
 614         skip_midx = skip_midx or ignore_midx
 615         d = dict((p.name, p) for p in self.packs
 616                  if not skip_midx or not isinstance(p, PackMidx))
 617         if os.path.exists(self.dir):
 618             if not skip_midx:
 619                 midxl = []
 620                 for ix in self.packs:
 621                     if isinstance(ix, PackMidx):
 622                         for name in ix.idxnames:
 623                             d[os.path.join(self.dir, name)] = ix
 624                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 625                     if not d.get(full):
 626                         mx = PackMidx(full)
 627                         (mxd, mxf) = os.path.split(mx.name)
 628                         broken = False
 629                         for n in mx.idxnames:
 630                             if not os.path.exists(os.path.join(mxd, n)):
 631                                 log(('warning: index %s missing\n' +
 632                                     '  used by %s\n') % (n, mxf))
 633                                 broken = True
 634                         if broken:
 635                             del mx
 636                             unlink(full)
 637                         else:
 638                             midxl.append(mx)
 639                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 640                 for ix in midxl:
 641                     any_needed = False
 642                     for sub in ix.idxnames:
 643                         found = d.get(os.path.join(self.dir, sub))
 644                         if not found or isinstance(found, PackIdx):
 645                             # doesn't exist, or exists but not in a midx
 646                             any_needed = True
 647                             break
 648                     if any_needed:
 649                         d[ix.name] = ix
 650                         for name in ix.idxnames:
 651                             d[os.path.join(self.dir, name)] = ix
 652                     elif not ix.force_keep:
 653                         debug1('midx: removing redundant: %s\n'
 654                                % os.path.basename(ix.name))
 655                         unlink(ix.name)
 656             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 657                 if not d.get(full):
 658                     try:
 659                         ix = open_idx(full)
 660                     except GitError, e:
 661                         add_error(e)
 662                         continue
 663                     d[full] = ix
 664             bfull = os.path.join(self.dir, 'bup.bloom')
 665             if self.bloom is None and os.path.exists(bfull):
 666                 self.bloom = ShaBloom(bfull)
 667             self.packs = list(set(d.values()))
 668             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 669             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 670                 self.do_bloom = True
 671             else:
 672                 self.bloom = None
 673         debug1('PackIdxList: using %d index%s.\n'
 674             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 675
 676     def add(self, hash):
 677         """Insert an additional object in the list."""
 678         self.also.add(hash)
 679
 680
 681 def calc_hash(type, content):
 682     """Calculate some content's hash in the Git fashion."""
 683     header = '%s %d\0' % (type, len(content))
 684     sum = Sha1(header)
 685     sum.update(content)
 686     return sum.digest()
 687
 688
 689 def _shalist_sort_key(ent):
 690     (mode, name, id) = ent
 691     if stat.S_ISDIR(int(mode, 8)):
 692         return name + '/'
 693     else:
 694         return name
 695
 696
 697 def open_idx(filename):
 698     if filename.endswith('.idx'):
 699         f = open(filename, 'rb')
 700         header = f.read(8)
 701         if header[0:4] == '\377tOc':
 702             version = struct.unpack('!I', header[4:8])[0]
 703             if version == 2:
 704                 return PackIdxV2(filename, f)
 705             else:
 706                 raise GitError('%s: expected idx file version 2, got %d'
 707                                % (filename, version))
 708         elif len(header) == 8 and header[0:4] < '\377tOc':
 709             return PackIdxV1(filename, f)
 710         else:
 711             raise GitError('%s: unrecognized idx file header' % filename)
 712     elif filename.endswith('.midx'):
 713         return PackMidx(filename)
 714     else:
 715         raise GitError('idx filenames must end with .idx or .midx')
 716
 717
 718 def idxmerge(idxlist, final_progress=True):
 719     """Generate a list of all the objects reachable in a PackIdxList."""
 720     def pfunc(count, total):
 721         progress('Reading indexes: %.2f%% (%d/%d)\r'
 722                  % (count*100.0/total, count, total))
 723     def pfinal(count, total):
 724         if final_progress:
 725             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 726     return merge_iter(idxlist, 10024, pfunc, pfinal)
 727
 728
 729 def _make_objcache():
 730     return PackIdxList(repo('objects/pack'))
 731
 732 class PackWriter:
 733     """Writes Git objects insid a pack file."""
 734     def __init__(self, objcache_maker=_make_objcache):
 735         self.count = 0
 736         self.outbytes = 0
 737         self.filename = None
 738         self.file = None
 739         self.idx = None
 740         self.objcache_maker = objcache_maker
 741         self.objcache = None
 742
 743     def __del__(self):
 744         self.close()
 745
 746     def _open(self):
 747         if not self.file:
 748             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 749             self.file = os.fdopen(fd, 'w+b')
 750             assert(name.endswith('.pack'))
 751             self.filename = name[:-5]
 752             self.file.write('PACK\0\0\0\2\0\0\0\0')
 753             self.idx = list(list() for i in xrange(256))
 754
 755     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 756     # in this basic version.
 757     def _raw_write(self, datalist, sha):
 758         self._open()
 759         f = self.file
 760         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 761         # the file never has a *partial* blob.  So let's make sure it's
 762         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 763         # to our hashsplit algorithm.)  f.write() does its own buffering,
 764         # but that's okay because we'll flush it in _end().
 765         oneblob = ''.join(datalist)
 766         try:
 767             f.write(oneblob)
 768         except IOError, e:
 769             raise GitError, e, sys.exc_info()[2]
 770         nw = len(oneblob)
 771         crc = zlib.crc32(oneblob) & 0xffffffff
 772         self._update_idx(sha, crc, nw)
 773         self.outbytes += nw
 774         self.count += 1
 775         return nw, crc
 776
 777     def _update_idx(self, sha, crc, size):
 778         assert(sha)
 779         if self.idx:
 780             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 781
 782     def _write(self, sha, type, content):
 783         if verbose:
 784             log('>')
 785         if not sha:
 786             sha = calc_hash(type, content)
 787         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 788         return sha
 789
 790     def breakpoint(self):
 791         """Clear byte and object counts and return the last processed id."""
 792         id = self._end()
 793         self.outbytes = self.count = 0
 794         return id
 795
 796     def _require_objcache(self):
 797         if self.objcache is None and self.objcache_maker:
 798             self.objcache = self.objcache_maker()
 799         if self.objcache is None:
 800             raise GitError(
 801                     "PackWriter not opened or can't check exists w/o objcache")
 802
 803     def exists(self, id, want_source=False):
 804         """Return non-empty if an object is found in the object cache."""
 805         self._require_objcache()
 806         return self.objcache.exists(id, want_source=want_source)
 807
 808     def maybe_write(self, type, content):
 809         """Write an object to the pack file if not present and return its id."""
 810         self._require_objcache()
 811         sha = calc_hash(type, content)
 812         if not self.exists(sha):
 813             self._write(sha, type, content)
 814             self.objcache.add(sha)
 815         return sha
 816
 817     def new_blob(self, blob):
 818         """Create a blob object in the pack with the supplied content."""
 819         return self.maybe_write('blob', blob)
 820
 821     def new_tree(self, shalist):
 822         """Create a tree object in the pack."""
 823         shalist = sorted(shalist, key = _shalist_sort_key)
 824         l = []
 825         for (mode,name,bin) in shalist:
 826             assert(mode)
 827             assert(mode != '0')
 828             assert(mode[0] != '0')
 829             assert(name)
 830             assert(len(bin) == 20)
 831             l.append('%s %s\0%s' % (mode,name,bin))
 832         return self.maybe_write('tree', ''.join(l))
 833
 834     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 835         l = []
 836         if tree: l.append('tree %s' % tree.encode('hex'))
 837         if parent: l.append('parent %s' % parent.encode('hex'))
 838         if author: l.append('author %s %s' % (author, _git_date(adate)))
 839         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 840         l.append('')
 841         l.append(msg)
 842         return self.maybe_write('commit', '\n'.join(l))
 843
 844     def new_commit(self, parent, tree, date, msg):
 845         """Create a commit object in the pack."""
 846         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 847         commit = self._new_commit(tree, parent,
 848                                   userline, date, userline, date,
 849                                   msg)
 850         return commit
 851
 852     def abort(self):
 853         """Remove the pack file from disk."""
 854         f = self.file
 855         if f:
 856             self.idx = None
 857             self.file = None
 858             f.close()
 859             os.unlink(self.filename + '.pack')
 860
 861     def _end(self, run_midx=True):
 862         f = self.file
 863         if not f: return None
 864         self.file = None
 865         self.objcache = None
 866         idx = self.idx
 867         self.idx = None
 868
 869         # update object count
 870         f.seek(8)
 871         cp = struct.pack('!i', self.count)
 872         assert(len(cp) == 4)
 873         f.write(cp)
 874
 875         # calculate the pack sha1sum
 876         f.seek(0)
 877         sum = Sha1()
 878         for b in chunkyreader(f):
 879             sum.update(b)
 880         packbin = sum.digest()
 881         f.write(packbin)
 882         f.close()
 883
 884         idx_f = open(self.filename + '.idx', 'wb')
 885         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 886         idx_f.close()
 887
 888         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 889         if os.path.exists(self.filename + '.map'):
 890             os.unlink(self.filename + '.map')
 891         os.rename(self.filename + '.pack', nameprefix + '.pack')
 892         os.rename(self.filename + '.idx', nameprefix + '.idx')
 893
 894         if run_midx:
 895             auto_midx(repo('objects/pack'))
 896         return nameprefix
 897
 898     def close(self, run_midx=True):
 899         """Close the pack file and move it to its definitive path."""
 900         return self._end(run_midx=run_midx)
 901
 902     def _write_pack_idx_v2(self, file, idx, packbin):
 903         sum = Sha1()
 904
 905         def write(data):
 906             file.write(data)
 907             sum.update(data)
 908
 909         write('\377tOc\0\0\0\2')
 910
 911         n = 0
 912         for part in idx:
 913             n += len(part)
 914             write(struct.pack('!i', n))
 915             part.sort(key=lambda x: x[0])
 916
 917         obj_list_sum = Sha1()
 918         for part in idx:
 919             for entry in part:
 920                 write(entry[0])
 921                 obj_list_sum.update(entry[0])
 922         for part in idx:
 923             for entry in part:
 924                 write(struct.pack('!I', entry[1]))
 925         ofs64_list = []
 926         for part in idx:
 927             for entry in part:
 928                 if entry[2] & 0x80000000:
 929                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 930                     ofs64_list.append(struct.pack('!Q', entry[2]))
 931                 else:
 932                     write(struct.pack('!i', entry[2]))
 933         for ofs64 in ofs64_list:
 934             write(ofs64)
 935
 936         write(packbin)
 937         file.write(sum.digest())
 938         return obj_list_sum.hexdigest()
 939
 940
 941 def _git_date(date):
 942     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 943
 944
 945 def _gitenv():
 946     os.environ['GIT_DIR'] = os.path.abspath(repo())
 947
 948
 949 def list_refs(refname = None):
 950     """Generate a list of tuples in the form (refname,hash).
 951     If a ref name is specified, list only this particular ref.
 952     """
 953     argv = ['git', 'show-ref', '--']
 954     if refname:
 955         argv += [refname]
 956     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 957     out = p.stdout.read().strip()
 958     rv = p.wait()  # not fatal
 959     if rv:
 960         assert(not out)
 961     if out:
 962         for d in out.split('\n'):
 963             (sha, name) = d.split(' ', 1)
 964             yield (name, sha.decode('hex'))
 965
 966
 967 def read_ref(refname):
 968     """Get the commit id of the most recent commit made on a given ref."""
 969     l = list(list_refs(refname))
 970     if l:
 971         assert(len(l) == 1)
 972         return l[0][1]
 973     else:
 974         return None
 975
 976
 977 def rev_list(ref, count=None):
 978     """Generate a list of reachable commits in reverse chronological order.
 979
 980     This generator walks through commits, from child to parent, that are
 981     reachable via the specified ref and yields a series of tuples of the form
 982     (date,hash).
 983
 984     If count is a non-zero integer, limit the number of commits to "count"
 985     objects.
 986     """
 987     assert(not ref.startswith('-'))
 988     opts = []
 989     if count:
 990         opts += ['-n', str(atoi(count))]
 991     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 992     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 993     commit = None
 994     for row in p.stdout:
 995         s = row.strip()
 996         if s.startswith('commit '):
 997             commit = s[7:].decode('hex')
 998         else:
 999             date = int(s)
1000             yield (date, commit)
1001     rv = p.wait()  # not fatal
1002     if rv:
1003         raise GitError, 'git rev-list returned error %d' % rv
1004
1005
1006 def rev_get_date(ref):
1007     """Get the date of the latest commit on the specified ref."""
1008     for (date, commit) in rev_list(ref, count=1):
1009         return date
1010     raise GitError, 'no such commit %r' % ref
1011
1012
1013 def rev_parse(committish):
1014     """Resolve the full hash for 'committish', if it exists.
1015
1016     Should be roughly equivalent to 'git rev-parse'.
1017
1018     Returns the hex value of the hash if it is found, None if 'committish' does
1019     not correspond to anything.
1020     """
1021     head = read_ref(committish)
1022     if head:
1023         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1024         return head
1025
1026     pL = PackIdxList(repo('objects/pack'))
1027
1028     if len(committish) == 40:
1029         try:
1030             hash = committish.decode('hex')
1031         except TypeError:
1032             return None
1033
1034         if pL.exists(hash):
1035             return hash
1036
1037     return None
1038
1039
1040 def update_ref(refname, newval, oldval):
1041     """Change the commit pointed to by a branch."""
1042     if not oldval:
1043         oldval = ''
1044     assert(refname.startswith('refs/heads/'))
1045     p = subprocess.Popen(['git', 'update-ref', refname,
1046                           newval.encode('hex'), oldval.encode('hex')],
1047                          preexec_fn = _gitenv)
1048     _git_wait('git update-ref', p)
1049
1050
1051 def guess_repo(path=None):
1052     """Set the path value in the global variable "repodir".
1053     This makes bup look for an existing bup repository, but not fail if a
1054     repository doesn't exist. Usually, if you are interacting with a bup
1055     repository, you would not be calling this function but using
1056     check_repo_or_die().
1057     """
1058     global repodir
1059     if path:
1060         repodir = path
1061     if not repodir:
1062         repodir = os.environ.get('BUP_DIR')
1063         if not repodir:
1064             repodir = os.path.expanduser('~/.bup')
1065
1066
1067 def init_repo(path=None):
1068     """Create the Git bare repository for bup in a given path."""
1069     guess_repo(path)
1070     d = repo()  # appends a / to the path
1071     parent = os.path.dirname(os.path.dirname(d))
1072     if parent and not os.path.exists(parent):
1073         raise GitError('parent directory "%s" does not exist\n' % parent)
1074     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1075         raise GitError('"%d" exists but is not a directory\n' % d)
1076     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1077                          preexec_fn = _gitenv)
1078     _git_wait('git init', p)
1079     # Force the index version configuration in order to ensure bup works
1080     # regardless of the version of the installed Git binary.
1081     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1082                          stdout=sys.stderr, preexec_fn = _gitenv)
1083     _git_wait('git config', p)
1084
1085
1086 def check_repo_or_die(path=None):
1087     """Make sure a bup repository exists, and abort if not.
1088     If the path to a particular repository was not specified, this function
1089     initializes the default repository automatically.
1090     """
1091     guess_repo(path)
1092     if not os.path.isdir(repo('objects/pack/.')):
1093         if repodir == home_repodir:
1094             init_repo()
1095         else:
1096             log('error: %r is not a bup/git repository\n' % repo())
1097             sys.exit(15)
1098
1099
1100 def treeparse(buf):
1101     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1102     ofs = 0
1103     while ofs < len(buf):
1104         z = buf[ofs:].find('\0')
1105         assert(z > 0)
1106         spl = buf[ofs:ofs+z].split(' ', 1)
1107         assert(len(spl) == 2)
1108         sha = buf[ofs+z+1:ofs+z+1+20]
1109         ofs += z+1+20
1110         yield (spl[0], spl[1], sha)
1111
1112
1113 _ver = None
1114 def ver():
1115     """Get Git's version and ensure a usable version is installed.
1116
1117     The returned version is formatted as an ordered tuple with each position
1118     representing a digit in the version tag. For example, the following tuple
1119     would represent version 1.6.6.9:
1120
1121         ('1', '6', '6', '9')
1122     """
1123     global _ver
1124     if not _ver:
1125         p = subprocess.Popen(['git', '--version'],
1126                              stdout=subprocess.PIPE)
1127         gvs = p.stdout.read()
1128         _git_wait('git --version', p)
1129         m = re.match(r'git version (\S+.\S+)', gvs)
1130         if not m:
1131             raise GitError('git --version weird output: %r' % gvs)
1132         _ver = tuple(m.group(1).split('.'))
1133     needed = ('1','5', '3', '1')
1134     if _ver < needed:
1135         raise GitError('git version %s or higher is required; you have %s'
1136                        % ('.'.join(needed), '.'.join(_ver)))
1137     return _ver
1138
1139
1140 def _git_wait(cmd, p):
1141     rv = p.wait()
1142     if rv != 0:
1143         raise GitError('%s returned %d' % (cmd, rv))
1144
1145
1146 def _git_capture(argv):
1147     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1148     r = p.stdout.read()
1149     _git_wait(repr(argv), p)
1150     return r
1151
1152
1153 class _AbortableIter:
1154     def __init__(self, it, onabort = None):
1155         self.it = it
1156         self.onabort = onabort
1157         self.done = None
1158
1159     def __iter__(self):
1160         return self
1161
1162     def next(self):
1163         try:
1164             return self.it.next()
1165         except StopIteration, e:
1166             self.done = True
1167             raise
1168         except:
1169             self.abort()
1170             raise
1171
1172     def abort(self):
1173         """Abort iteration and call the abortion callback, if needed."""
1174         if not self.done:
1175             self.done = True
1176             if self.onabort:
1177                 self.onabort()
1178
1179     def __del__(self):
1180         self.abort()
1181
1182
1183 _ver_warned = 0
1184 class CatPipe:
1185     """Link to 'git cat-file' that is used to retrieve blob data."""
1186     def __init__(self):
1187         global _ver_warned
1188         wanted = ('1','5','6')
1189         if ver() < wanted:
1190             if not _ver_warned:
1191                 log('warning: git version < %s; bup will be slow.\n'
1192                     % '.'.join(wanted))
1193                 _ver_warned = 1
1194             self.get = self._slow_get
1195         else:
1196             self.p = self.inprogress = None
1197             self.get = self._fast_get
1198
1199     def _abort(self):
1200         if self.p:
1201             self.p.stdout.close()
1202             self.p.stdin.close()
1203         self.p = None
1204         self.inprogress = None
1205
1206     def _restart(self):
1207         self._abort()
1208         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1209                                   stdin=subprocess.PIPE,
1210                                   stdout=subprocess.PIPE,
1211                                   close_fds = True,
1212                                   bufsize = 4096,
1213                                   preexec_fn = _gitenv)
1214
1215     def _fast_get(self, id):
1216         if not self.p or self.p.poll() != None:
1217             self._restart()
1218         assert(self.p)
1219         assert(self.p.poll() == None)
1220         if self.inprogress:
1221             log('_fast_get: opening %r while %r is open'
1222                 % (id, self.inprogress))
1223         assert(not self.inprogress)
1224         assert(id.find('\n') < 0)
1225         assert(id.find('\r') < 0)
1226         assert(not id.startswith('-'))
1227         self.inprogress = id
1228         self.p.stdin.write('%s\n' % id)
1229         self.p.stdin.flush()
1230         hdr = self.p.stdout.readline()
1231         if hdr.endswith(' missing\n'):
1232             self.inprogress = None
1233             raise KeyError('blob %r is missing' % id)
1234         spl = hdr.split(' ')
1235         if len(spl) != 3 or len(spl[0]) != 40:
1236             raise GitError('expected blob, got %r' % spl)
1237         (hex, type, size) = spl
1238
1239         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1240                            onabort = self._abort)
1241         try:
1242             yield type
1243             for blob in it:
1244                 yield blob
1245             assert(self.p.stdout.readline() == '\n')
1246             self.inprogress = None
1247         except Exception, e:
1248             it.abort()
1249             raise
1250
1251     def _slow_get(self, id):
1252         assert(id.find('\n') < 0)
1253         assert(id.find('\r') < 0)
1254         assert(id[0] != '-')
1255         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1256         yield type
1257
1258         p = subprocess.Popen(['git', 'cat-file', type, id],
1259                              stdout=subprocess.PIPE,
1260                              preexec_fn = _gitenv)
1261         for blob in chunkyreader(p.stdout):
1262             yield blob
1263         _git_wait('git cat-file', p)
1264
1265     def _join(self, it):
1266         type = it.next()
1267         if type == 'blob':
1268             for blob in it:
1269                 yield blob
1270         elif type == 'tree':
1271             treefile = ''.join(it)
1272             for (mode, name, sha) in treeparse(treefile):
1273                 for blob in self.join(sha.encode('hex')):
1274                     yield blob
1275         elif type == 'commit':
1276             treeline = ''.join(it).split('\n')[0]
1277             assert(treeline.startswith('tree '))
1278             for blob in self.join(treeline[5:]):
1279                 yield blob
1280         else:
1281             raise GitError('invalid object type %r: expected blob/tree/commit'
1282                            % type)
1283
1284     def join(self, id):
1285         """Generate a list of the content of all blobs that can be reached
1286         from an object.  The hash given in 'id' must point to a blob, a tree
1287         or a commit. The content of all blobs that can be seen from trees or
1288         commits will be added to the list.
1289         """
1290         try:
1291             for d in self._join(self.get(id)):
1292                 yield d
1293         except StopIteration:
1294             log('booger!\n')
1295
1296 def tags():
1297     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1298     tags = {}
1299     for (n,c) in list_refs():
1300         if n.startswith('refs/tags/'):
1301             name = n[10:]
1302             if not c in tags:
1303                 tags[c] = []
1304
1305             tags[c].append(name)  # more than one tag can point at 'c'
1306
1307     return tags