lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 4
  10
  11 """Discussion of bloom constants for bup:
  12
  13 There are four basic things to consider when building a bloom filter:
  14 The size, in bits, of the filter
  15 The capacity, in entries, of the filter
  16 The probability of a false positive that is tolerable
  17 The number of bits readily available to use for addresing filter bits
  18
  19 There is one major tunable that is not directly related to the above:
  20 k: the number of bits set in the filter per entry
  21
  22 Here's a wall of numbers showing the relationship between k; the ratio between
  23 the filter size in bits and the entries in the filter; and pfalse_positive:
  24
  25 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  26  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  27  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  51
  52 Here's a table showing available repository size for a given pfalse_positive
  53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  54 filter and 8192bytes per object):
  55
  56 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  60
  61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  62 addressing.
  63
  64 filter size scales linearly with repository size for a given k and pfalse.
  65
  66 Here's a table of filter sizes for a 1 TiB repository:
  67
  68 pfalse| k=3        | k=4        | k=5        | k=6
  69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  72
  73 For bup:
  74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  75 per lookup will be worse than the two required for midx.
  76 * We want the pfalse_positive to be low enough that the cost of sometimes
  77 faulting on the midx doesn't overcome the benefit of the bloom filter.
  78 * We have readily available 160 bits for addressing the filter.
  79 * We want to be able to have a single bloom address entire repositories of
  80 reasonable size.
  81
  82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  83 that bup needs.  As such, I've implemented bloom addressing, adding and
  84 checking functions in C for these two values.  Because k=5 requires less space
  85 and gives better overall pfalse_positive perofrmance, it is preferred if a
  86 table with k=5 can represent the repository.
  87
  88 None of this tells us what max_pfalse_positive to choose.
  89
  90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  91 """
  92 BLOOM_VERSION = 2
  93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  96
  97 verbose = 0
  98 ignore_midx = 0
  99 home_repodir = os.path.expanduser('~/.bup')
 100 repodir = None
 101
 102 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 104
 105 _total_searches = 0
 106 _total_steps = 0
 107
 108
 109 class GitError(Exception):
 110     pass
 111
 112
 113 def repo(sub = ''):
 114     """Get the path to the git repository or one of its subdirectories."""
 115     global repodir
 116     if not repodir:
 117         raise GitError('You should call check_repo_or_die()')
 118
 119     # If there's a .git subdirectory, then the actual repo is in there.
 120     gd = os.path.join(repodir, '.git')
 121     if os.path.exists(gd):
 122         repodir = gd
 123
 124     return os.path.join(repodir, sub)
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from semgmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be assembled
 172
 173     For more information on the name mangling algorythm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def _encode_packobj(type, content):
 184     szout = ''
 185     sz = len(content)
 186     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 187     sz >>= 4
 188     while 1:
 189         if sz: szbits |= 0x80
 190         szout += chr(szbits)
 191         if not sz:
 192             break
 193         szbits = sz & 0x7f
 194         sz >>= 7
 195     z = zlib.compressobj(1)
 196     yield szout
 197     yield z.compress(content)
 198     yield z.flush()
 199
 200
 201 def _encode_looseobj(type, content):
 202     z = zlib.compressobj(1)
 203     yield z.compress('%s %d\0' % (type, len(content)))
 204     yield z.compress(content)
 205     yield z.flush()
 206
 207
 208 def _decode_looseobj(buf):
 209     assert(buf);
 210     s = zlib.decompress(buf)
 211     i = s.find('\0')
 212     assert(i > 0)
 213     l = s[:i].split(' ')
 214     type = l[0]
 215     sz = int(l[1])
 216     content = s[i+1:]
 217     assert(type in _typemap)
 218     assert(sz == len(content))
 219     return (type, content)
 220
 221
 222 def _decode_packobj(buf):
 223     assert(buf)
 224     c = ord(buf[0])
 225     type = _typermap[(c & 0x70) >> 4]
 226     sz = c & 0x0f
 227     shift = 4
 228     i = 0
 229     while c & 0x80:
 230         i += 1
 231         c = ord(buf[i])
 232         sz |= (c & 0x7f) << shift
 233         shift += 7
 234         if not (c & 0x80):
 235             break
 236     return (type, zlib.decompress(buf[i+1:]))
 237
 238
 239 class PackIdx:
 240     def __init__(self):
 241         assert(0)
 242
 243     def find_offset(self, hash):
 244         """Get the offset of an object inside the index file."""
 245         idx = self._idx_from_hash(hash)
 246         if idx != None:
 247             return self._ofs_from_idx(idx)
 248         return None
 249
 250     def exists(self, hash, want_source=False):
 251         """Return nonempty if the object exists in this index."""
 252         if hash and (self._idx_from_hash(hash) != None):
 253             return want_source and self.name or True
 254         return None
 255
 256     def __len__(self):
 257         return int(self.fanout[255])
 258
 259     def _idx_from_hash(self, hash):
 260         global _total_searches, _total_steps
 261         _total_searches += 1
 262         assert(len(hash) == 20)
 263         b1 = ord(hash[0])
 264         start = self.fanout[b1-1] # range -1..254
 265         end = self.fanout[b1] # range 0..255
 266         want = str(hash)
 267         _total_steps += 1  # lookup table is a step
 268         while start < end:
 269             _total_steps += 1
 270             mid = start + (end-start)/2
 271             v = self._idx_to_hash(mid)
 272             if v < want:
 273                 start = mid+1
 274             elif v > want:
 275                 end = mid
 276             else: # got it!
 277                 return mid
 278         return None
 279
 280
 281 class PackIdxV1(PackIdx):
 282     """Object representation of a Git pack index (version 1) file."""
 283     def __init__(self, filename, f):
 284         self.name = filename
 285         self.idxnames = [self.name]
 286         self.map = mmap_read(f)
 287         self.fanout = list(struct.unpack('!256I',
 288                                          str(buffer(self.map, 0, 256*4))))
 289         self.fanout.append(0)  # entry "-1"
 290         nsha = self.fanout[255]
 291         self.sha_ofs = 256*4
 292         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 293
 294     def _ofs_from_idx(self, idx):
 295         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 296
 297     def _idx_to_hash(self, idx):
 298         return str(self.shatable[idx*24+4 : idx*24+24])
 299
 300     def __iter__(self):
 301         for i in xrange(self.fanout[255]):
 302             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 303
 304
 305 class PackIdxV2(PackIdx):
 306     """Object representation of a Git pack index (version 2) file."""
 307     def __init__(self, filename, f):
 308         self.name = filename
 309         self.idxnames = [self.name]
 310         self.map = mmap_read(f)
 311         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 312         self.fanout = list(struct.unpack('!256I',
 313                                          str(buffer(self.map, 8, 256*4))))
 314         self.fanout.append(0)  # entry "-1"
 315         nsha = self.fanout[255]
 316         self.sha_ofs = 8 + 256*4
 317         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 318         self.ofstable = buffer(self.map,
 319                                self.sha_ofs + nsha*20 + nsha*4,
 320                                nsha*4)
 321         self.ofs64table = buffer(self.map,
 322                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 323
 324     def _ofs_from_idx(self, idx):
 325         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 326         if ofs & 0x80000000:
 327             idx64 = ofs & 0x7fffffff
 328             ofs = struct.unpack('!Q',
 329                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 330         return ofs
 331
 332     def _idx_to_hash(self, idx):
 333         return str(self.shatable[idx*20:(idx+1)*20])
 334
 335     def __iter__(self):
 336         for i in xrange(self.fanout[255]):
 337             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 338
 339
 340 extract_bits = _helpers.extract_bits
 341
 342 bloom_contains = _helpers.bloom_contains
 343 bloom_add = _helpers.bloom_add
 344
 345
 346 class ShaBloom:
 347     """Wrapper which contains data from multiple index files.
 348     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 349     and make it possible for bup to expand Git's indexing capabilities to vast
 350     amounts of files.
 351     """
 352     def __init__(self, filename, f=None, readwrite=False):
 353         self.name = filename
 354         assert(filename.endswith('.bloom'))
 355         if readwrite:
 356             self.rwfile = f or open(filename, 'r+b')
 357             self.map = mmap_readwrite(self.rwfile, close=False)
 358         else:
 359             self.rwfile = None
 360             self.map = mmap_read(f or open(filename, 'rb'))
 361         if str(self.map[0:4]) != 'BLOM':
 362             log('Warning: skipping: invalid BLOM header in %r\n' % filename)
 363             return self._init_failed()
 364         ver = struct.unpack('!I', self.map[4:8])[0]
 365         if ver < BLOOM_VERSION:
 366             log('Warning: ignoring old-style (v%d) bloom %r\n'
 367                 % (ver, filename))
 368             return self._init_failed()
 369         if ver > BLOOM_VERSION:
 370             log('Warning: ignoring too-new (v%d) bloom %r\n'
 371                 % (ver, filename))
 372             return self._init_failed()
 373
 374         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 375         idxnamestr = str(self.map[16 + 2**self.bits:])
 376         if idxnamestr:
 377             self.idxnames = idxnamestr.split('\0')
 378         else:
 379             self.idxnames = []
 380
 381     def _init_failed(self):
 382         if self.map:
 383             self.map.close()
 384             self.map = None
 385         if self.rwfile:
 386             self.rwfile.close()
 387             self.rwfile = None
 388         self.idxnames = []
 389         self.bits = self.entries = 0
 390
 391     def valid(self):
 392         return self.map and self.bits
 393
 394     def __del__(self):
 395         self.close()
 396
 397     def close(self):
 398         if self.map:
 399             if self.rwfile:
 400                 debug2("bloom: closing with %d entries\n" % self.entries)
 401                 self.map[12:16] = struct.pack('!I', self.entries)
 402                 self.map.flush()
 403         if self.rwfile:
 404             self.rwfile.seek(16 + 2**self.bits)
 405             if self.idxnames:
 406                 self.rwfile.write('\0'.join(self.idxnames))
 407         self._init_failed()
 408
 409     def pfalse_positive(self, additional=0):
 410         n = self.entries + additional
 411         m = 8*2**self.bits
 412         k = self.k
 413         return 100*(1-math.exp(-k*float(n)/m))**k
 414
 415     def add_idx(self, ix):
 416         """Add the object to the filter, return current pfalse_positive."""
 417         if not self.map: raise Exception, "Cannot add to closed bloom"
 418         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 419         self.idxnames.append(os.path.basename(ix.name))
 420
 421     def exists(self, sha):
 422         """Return nonempty if the object probably exists in the bloom filter."""
 423         global _total_searches, _total_steps
 424         _total_searches += 1
 425         if not self.map: return None
 426         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 427         _total_steps += steps
 428         return found
 429
 430     @classmethod
 431     def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
 432         """Create and return a bloom filter for `expected` entries."""
 433         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 434         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 435         if bits > MAX_BLOOM_BITS[k]:
 436             log('bloom: warning, max bits exceeded, non-optimal\n')
 437             bits = MAX_BLOOM_BITS[k]
 438         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 439         f = f or open(name, 'w+b')
 440         f.write('BLOM')
 441         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 442         assert(f.tell() == 16)
 443         f.write('\0'*2**bits)
 444         f.seek(0)
 445         return cls(name, f=f, readwrite=readwrite)
 446
 447     def __len__(self):
 448         return self.entries
 449
 450
 451 class PackMidx:
 452     """Wrapper which contains data from multiple index files.
 453     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 454     and make it possible for bup to expand Git's indexing capabilities to vast
 455     amounts of files.
 456     """
 457     def __init__(self, filename):
 458         self.name = filename
 459         self.force_keep = False
 460         assert(filename.endswith('.midx'))
 461         self.map = mmap_read(open(filename))
 462         if str(self.map[0:4]) != 'MIDX':
 463             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 464             self.force_keep = True
 465             return self._init_failed()
 466         ver = struct.unpack('!I', self.map[4:8])[0]
 467         if ver < MIDX_VERSION:
 468             log('Warning: ignoring old-style (v%d) midx %r\n'
 469                 % (ver, filename))
 470             self.force_keep = False  # old stuff is boring
 471             return self._init_failed()
 472         if ver > MIDX_VERSION:
 473             log('Warning: ignoring too-new (v%d) midx %r\n'
 474                 % (ver, filename))
 475             self.force_keep = True  # new stuff is exciting
 476             return self._init_failed()
 477
 478         self.bits = _helpers.firstword(self.map[8:12])
 479         self.entries = 2**self.bits
 480         self.fanout = buffer(self.map, 12, self.entries*4)
 481         self.sha_ofs = 12 + self.entries*4
 482         self.nsha = nsha = self._fanget(self.entries-1)
 483         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 484         self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
 485         self.idxname_ofs = self.sha_ofs + 24*nsha
 486         self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
 487
 488     def _init_failed(self):
 489         self.bits = 0
 490         self.entries = 1
 491         self.fanout = buffer('\0\0\0\0')
 492         self.shatable = buffer('\0'*20)
 493         self.idxnames = []
 494
 495     def _fanget(self, i):
 496         start = i*4
 497         s = self.fanout[start:start+4]
 498         return _helpers.firstword(s)
 499
 500     def _get(self, i):
 501         return str(self.shatable[i*20:(i+1)*20])
 502
 503     def _get_idx_i(self, i):
 504         return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
 505
 506     def _get_idxname(self, i):
 507         return self.idxnames[self._get_idx_i(i)]
 508
 509     def exists(self, hash, want_source=False):
 510         """Return nonempty if the object exists in the index files."""
 511         global _total_searches, _total_steps
 512         _total_searches += 1
 513         want = str(hash)
 514         el = extract_bits(want, self.bits)
 515         if el:
 516             start = self._fanget(el-1)
 517             startv = el << (32-self.bits)
 518         else:
 519             start = 0
 520             startv = 0
 521         end = self._fanget(el)
 522         endv = (el+1) << (32-self.bits)
 523         _total_steps += 1   # lookup table is a step
 524         hashv = _helpers.firstword(hash)
 525         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 526         while start < end:
 527             _total_steps += 1
 528             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 529             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 530             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 531             v = self._get(mid)
 532             #print '    %08x' % self._num(v)
 533             if v < want:
 534                 start = mid+1
 535                 startv = _helpers.firstword(v)
 536             elif v > want:
 537                 end = mid
 538                 endv = _helpers.firstword(v)
 539             else: # got it!
 540                 return want_source and self._get_idxname(mid) or True
 541         return None
 542
 543     def __iter__(self):
 544         for i in xrange(self._fanget(self.entries-1)):
 545             yield buffer(self.shatable, i*20, 20)
 546
 547     def __len__(self):
 548         return int(self._fanget(self.entries-1))
 549
 550
 551 _mpi_count = 0
 552 class PackIdxList:
 553     def __init__(self, dir):
 554         global _mpi_count
 555         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 556         _mpi_count += 1
 557         self.dir = dir
 558         self.also = set()
 559         self.packs = []
 560         self.do_bloom = False
 561         self.bloom = None
 562         self.refresh()
 563
 564     def __del__(self):
 565         global _mpi_count
 566         _mpi_count -= 1
 567         assert(_mpi_count == 0)
 568
 569     def __iter__(self):
 570         return iter(idxmerge(self.packs))
 571
 572     def __len__(self):
 573         return sum(len(pack) for pack in self.packs)
 574
 575     def exists(self, hash, want_source=False):
 576         """Return nonempty if the object exists in the index files."""
 577         global _total_searches
 578         _total_searches += 1
 579         if hash in self.also:
 580             return True
 581         if self.do_bloom and self.bloom is not None:
 582             _total_searches -= 1  # will be incremented by bloom
 583             if self.bloom.exists(hash):
 584                 self.do_bloom = False
 585             else:
 586                 return None
 587         for i in xrange(len(self.packs)):
 588             p = self.packs[i]
 589             _total_searches -= 1  # will be incremented by sub-pack
 590             ix = p.exists(hash, want_source=want_source)
 591             if ix:
 592                 # reorder so most recently used packs are searched first
 593                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 594                 return ix
 595         self.do_bloom = True
 596         return None
 597
 598     def refresh(self, skip_midx = False):
 599         """Refresh the index list.
 600         This method verifies if .midx files were superseded (e.g. all of its
 601         contents are in another, bigger .midx file) and removes the superseded
 602         files.
 603
 604         If skip_midx is True, all work on .midx files will be skipped and .midx
 605         files will be removed from the list.
 606
 607         The module-global variable 'ignore_midx' can force this function to
 608         always act as if skip_midx was True.
 609         """
 610         self.bloom = None # Always reopen the bloom as it may have been relaced
 611         self.do_bloom = False
 612         skip_midx = skip_midx or ignore_midx
 613         d = dict((p.name, p) for p in self.packs
 614                  if not skip_midx or not isinstance(p, PackMidx))
 615         if os.path.exists(self.dir):
 616             if not skip_midx:
 617                 midxl = []
 618                 for ix in self.packs:
 619                     if isinstance(ix, PackMidx):
 620                         for name in ix.idxnames:
 621                             d[os.path.join(self.dir, name)] = ix
 622                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 623                     if not d.get(full):
 624                         mx = PackMidx(full)
 625                         (mxd, mxf) = os.path.split(mx.name)
 626                         broken = False
 627                         for n in mx.idxnames:
 628                             if not os.path.exists(os.path.join(mxd, n)):
 629                                 log(('warning: index %s missing\n' +
 630                                     '  used by %s\n') % (n, mxf))
 631                                 broken = True
 632                         if broken:
 633                             del mx
 634                             unlink(full)
 635                         else:
 636                             midxl.append(mx)
 637                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 638                 for ix in midxl:
 639                     any_needed = False
 640                     for sub in ix.idxnames:
 641                         found = d.get(os.path.join(self.dir, sub))
 642                         if not found or isinstance(found, PackIdx):
 643                             # doesn't exist, or exists but not in a midx
 644                             any_needed = True
 645                             break
 646                     if any_needed:
 647                         d[ix.name] = ix
 648                         for name in ix.idxnames:
 649                             d[os.path.join(self.dir, name)] = ix
 650                     elif not ix.force_keep:
 651                         debug1('midx: removing redundant: %s\n'
 652                                % os.path.basename(ix.name))
 653                         unlink(ix.name)
 654             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 655                 if not d.get(full):
 656                     try:
 657                         ix = open_idx(full)
 658                     except GitError, e:
 659                         add_error(e)
 660                         continue
 661                     d[full] = ix
 662             bfull = os.path.join(self.dir, 'bup.bloom')
 663             if self.bloom is None and os.path.exists(bfull):
 664                 self.bloom = ShaBloom(bfull)
 665             self.packs = list(set(d.values()))
 666             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 667             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 668                 self.do_bloom = True
 669             else:
 670                 self.bloom = None
 671         debug1('PackIdxList: using %d index%s.\n'
 672             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 673
 674     def add(self, hash):
 675         """Insert an additional object in the list."""
 676         self.also.add(hash)
 677
 678
 679 def calc_hash(type, content):
 680     """Calculate some content's hash in the Git fashion."""
 681     header = '%s %d\0' % (type, len(content))
 682     sum = Sha1(header)
 683     sum.update(content)
 684     return sum.digest()
 685
 686
 687 def _shalist_sort_key(ent):
 688     (mode, name, id) = ent
 689     if stat.S_ISDIR(int(mode, 8)):
 690         return name + '/'
 691     else:
 692         return name
 693
 694
 695 def open_idx(filename):
 696     if filename.endswith('.idx'):
 697         f = open(filename, 'rb')
 698         header = f.read(8)
 699         if header[0:4] == '\377tOc':
 700             version = struct.unpack('!I', header[4:8])[0]
 701             if version == 2:
 702                 return PackIdxV2(filename, f)
 703             else:
 704                 raise GitError('%s: expected idx file version 2, got %d'
 705                                % (filename, version))
 706         elif len(header) == 8 and header[0:4] < '\377tOc':
 707             return PackIdxV1(filename, f)
 708         else:
 709             raise GitError('%s: unrecognized idx file header' % filename)
 710     elif filename.endswith('.midx'):
 711         return PackMidx(filename)
 712     else:
 713         raise GitError('idx filenames must end with .idx or .midx')
 714
 715
 716 def idxmerge(idxlist, final_progress=True):
 717     """Generate a list of all the objects reachable in a PackIdxList."""
 718     def pfunc(count, total):
 719         progress('Reading indexes: %.2f%% (%d/%d)\r'
 720                  % (count*100.0/total, count, total))
 721     def pfinal(count, total):
 722         if final_progress:
 723             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 724     return merge_iter(idxlist, 10024, pfunc, pfinal)
 725
 726
 727 def _make_objcache():
 728     return PackIdxList(repo('objects/pack'))
 729
 730 class PackWriter:
 731     """Writes Git objects insid a pack file."""
 732     def __init__(self, objcache_maker=_make_objcache):
 733         self.count = 0
 734         self.outbytes = 0
 735         self.filename = None
 736         self.file = None
 737         self.idx = None
 738         self.objcache_maker = objcache_maker
 739         self.objcache = None
 740
 741     def __del__(self):
 742         self.close()
 743
 744     def _open(self):
 745         if not self.file:
 746             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 747             self.file = os.fdopen(fd, 'w+b')
 748             assert(name.endswith('.pack'))
 749             self.filename = name[:-5]
 750             self.file.write('PACK\0\0\0\2\0\0\0\0')
 751             self.idx = list(list() for i in xrange(256))
 752
 753     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 754     # in this basic version.
 755     def _raw_write(self, datalist, sha):
 756         self._open()
 757         f = self.file
 758         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 759         # the file never has a *partial* blob.  So let's make sure it's
 760         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 761         # to our hashsplit algorithm.)  f.write() does its own buffering,
 762         # but that's okay because we'll flush it in _end().
 763         oneblob = ''.join(datalist)
 764         try:
 765             f.write(oneblob)
 766         except IOError, e:
 767             raise GitError, e, sys.exc_info()[2]
 768         nw = len(oneblob)
 769         crc = zlib.crc32(oneblob) & 0xffffffff
 770         self._update_idx(sha, crc, nw)
 771         self.outbytes += nw
 772         self.count += 1
 773         return nw, crc
 774
 775     def _update_idx(self, sha, crc, size):
 776         assert(sha)
 777         if self.idx:
 778             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 779
 780     def _write(self, sha, type, content):
 781         if verbose:
 782             log('>')
 783         if not sha:
 784             sha = calc_hash(type, content)
 785         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 786         return sha
 787
 788     def breakpoint(self):
 789         """Clear byte and object counts and return the last processed id."""
 790         id = self._end()
 791         self.outbytes = self.count = 0
 792         return id
 793
 794     def _require_objcache(self):
 795         if self.objcache is None and self.objcache_maker:
 796             self.objcache = self.objcache_maker()
 797         if self.objcache is None:
 798             raise GitError(
 799                     "PackWriter not opened or can't check exists w/o objcache")
 800
 801     def exists(self, id, want_source=False):
 802         """Return non-empty if an object is found in the object cache."""
 803         self._require_objcache()
 804         return self.objcache.exists(id, want_source=want_source)
 805
 806     def maybe_write(self, type, content):
 807         """Write an object to the pack file if not present and return its id."""
 808         self._require_objcache()
 809         sha = calc_hash(type, content)
 810         if not self.exists(sha):
 811             self._write(sha, type, content)
 812             self.objcache.add(sha)
 813         return sha
 814
 815     def new_blob(self, blob):
 816         """Create a blob object in the pack with the supplied content."""
 817         return self.maybe_write('blob', blob)
 818
 819     def new_tree(self, shalist):
 820         """Create a tree object in the pack."""
 821         shalist = sorted(shalist, key = _shalist_sort_key)
 822         l = []
 823         for (mode,name,bin) in shalist:
 824             assert(mode)
 825             assert(mode != '0')
 826             assert(mode[0] != '0')
 827             assert(name)
 828             assert(len(bin) == 20)
 829             l.append('%s %s\0%s' % (mode,name,bin))
 830         return self.maybe_write('tree', ''.join(l))
 831
 832     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 833         l = []
 834         if tree: l.append('tree %s' % tree.encode('hex'))
 835         if parent: l.append('parent %s' % parent.encode('hex'))
 836         if author: l.append('author %s %s' % (author, _git_date(adate)))
 837         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 838         l.append('')
 839         l.append(msg)
 840         return self.maybe_write('commit', '\n'.join(l))
 841
 842     def new_commit(self, parent, tree, date, msg):
 843         """Create a commit object in the pack."""
 844         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 845         commit = self._new_commit(tree, parent,
 846                                   userline, date, userline, date,
 847                                   msg)
 848         return commit
 849
 850     def abort(self):
 851         """Remove the pack file from disk."""
 852         f = self.file
 853         if f:
 854             self.idx = None
 855             self.file = None
 856             f.close()
 857             os.unlink(self.filename + '.pack')
 858
 859     def _end(self, run_midx=True):
 860         f = self.file
 861         if not f: return None
 862         self.file = None
 863         self.objcache = None
 864         idx = self.idx
 865         self.idx = None
 866
 867         # update object count
 868         f.seek(8)
 869         cp = struct.pack('!i', self.count)
 870         assert(len(cp) == 4)
 871         f.write(cp)
 872
 873         # calculate the pack sha1sum
 874         f.seek(0)
 875         sum = Sha1()
 876         for b in chunkyreader(f):
 877             sum.update(b)
 878         packbin = sum.digest()
 879         f.write(packbin)
 880         f.close()
 881
 882         idx_f = open(self.filename + '.idx', 'wb')
 883         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 884         idx_f.close()
 885
 886         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 887         if os.path.exists(self.filename + '.map'):
 888             os.unlink(self.filename + '.map')
 889         os.rename(self.filename + '.pack', nameprefix + '.pack')
 890         os.rename(self.filename + '.idx', nameprefix + '.idx')
 891
 892         if run_midx:
 893             auto_midx(repo('objects/pack'))
 894         return nameprefix
 895
 896     def close(self, run_midx=True):
 897         """Close the pack file and move it to its definitive path."""
 898         return self._end(run_midx=run_midx)
 899
 900     def _write_pack_idx_v2(self, file, idx, packbin):
 901         sum = Sha1()
 902
 903         def write(data):
 904             file.write(data)
 905             sum.update(data)
 906
 907         write('\377tOc\0\0\0\2')
 908
 909         n = 0
 910         for part in idx:
 911             n += len(part)
 912             write(struct.pack('!i', n))
 913             part.sort(key=lambda x: x[0])
 914
 915         obj_list_sum = Sha1()
 916         for part in idx:
 917             for entry in part:
 918                 write(entry[0])
 919                 obj_list_sum.update(entry[0])
 920         for part in idx:
 921             for entry in part:
 922                 write(struct.pack('!I', entry[1]))
 923         ofs64_list = []
 924         for part in idx:
 925             for entry in part:
 926                 if entry[2] & 0x80000000:
 927                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 928                     ofs64_list.append(struct.pack('!Q', entry[2]))
 929                 else:
 930                     write(struct.pack('!i', entry[2]))
 931         for ofs64 in ofs64_list:
 932             write(ofs64)
 933
 934         write(packbin)
 935         file.write(sum.digest())
 936         return obj_list_sum.hexdigest()
 937
 938
 939 def _git_date(date):
 940     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 941
 942
 943 def _gitenv():
 944     os.environ['GIT_DIR'] = os.path.abspath(repo())
 945
 946
 947 def list_refs(refname = None):
 948     """Generate a list of tuples in the form (refname,hash).
 949     If a ref name is specified, list only this particular ref.
 950     """
 951     argv = ['git', 'show-ref', '--']
 952     if refname:
 953         argv += [refname]
 954     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 955     out = p.stdout.read().strip()
 956     rv = p.wait()  # not fatal
 957     if rv:
 958         assert(not out)
 959     if out:
 960         for d in out.split('\n'):
 961             (sha, name) = d.split(' ', 1)
 962             yield (name, sha.decode('hex'))
 963
 964
 965 def read_ref(refname):
 966     """Get the commit id of the most recent commit made on a given ref."""
 967     l = list(list_refs(refname))
 968     if l:
 969         assert(len(l) == 1)
 970         return l[0][1]
 971     else:
 972         return None
 973
 974
 975 def rev_list(ref, count=None):
 976     """Generate a list of reachable commits in reverse chronological order.
 977
 978     This generator walks through commits, from child to parent, that are
 979     reachable via the specified ref and yields a series of tuples of the form
 980     (date,hash).
 981
 982     If count is a non-zero integer, limit the number of commits to "count"
 983     objects.
 984     """
 985     assert(not ref.startswith('-'))
 986     opts = []
 987     if count:
 988         opts += ['-n', str(atoi(count))]
 989     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 990     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 991     commit = None
 992     for row in p.stdout:
 993         s = row.strip()
 994         if s.startswith('commit '):
 995             commit = s[7:].decode('hex')
 996         else:
 997             date = int(s)
 998             yield (date, commit)
 999     rv = p.wait()  # not fatal
1000     if rv:
1001         raise GitError, 'git rev-list returned error %d' % rv
1002
1003
1004 def rev_get_date(ref):
1005     """Get the date of the latest commit on the specified ref."""
1006     for (date, commit) in rev_list(ref, count=1):
1007         return date
1008     raise GitError, 'no such commit %r' % ref
1009
1010
1011 def rev_parse(committish):
1012     """Resolve the full hash for 'committish', if it exists.
1013
1014     Should be roughly equivalent to 'git rev-parse'.
1015
1016     Returns the hex value of the hash if it is found, None if 'committish' does
1017     not correspond to anything.
1018     """
1019     head = read_ref(committish)
1020     if head:
1021         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1022         return head
1023
1024     pL = PackIdxList(repo('objects/pack'))
1025
1026     if len(committish) == 40:
1027         try:
1028             hash = committish.decode('hex')
1029         except TypeError:
1030             return None
1031
1032         if pL.exists(hash):
1033             return hash
1034
1035     return None
1036
1037
1038 def update_ref(refname, newval, oldval):
1039     """Change the commit pointed to by a branch."""
1040     if not oldval:
1041         oldval = ''
1042     assert(refname.startswith('refs/heads/'))
1043     p = subprocess.Popen(['git', 'update-ref', refname,
1044                           newval.encode('hex'), oldval.encode('hex')],
1045                          preexec_fn = _gitenv)
1046     _git_wait('git update-ref', p)
1047
1048
1049 def guess_repo(path=None):
1050     """Set the path value in the global variable "repodir".
1051     This makes bup look for an existing bup repository, but not fail if a
1052     repository doesn't exist. Usually, if you are interacting with a bup
1053     repository, you would not be calling this function but using
1054     check_repo_or_die().
1055     """
1056     global repodir
1057     if path:
1058         repodir = path
1059     if not repodir:
1060         repodir = os.environ.get('BUP_DIR')
1061         if not repodir:
1062             repodir = os.path.expanduser('~/.bup')
1063
1064
1065 def init_repo(path=None):
1066     """Create the Git bare repository for bup in a given path."""
1067     guess_repo(path)
1068     d = repo()  # appends a / to the path
1069     parent = os.path.dirname(os.path.dirname(d))
1070     if parent and not os.path.exists(parent):
1071         raise GitError('parent directory "%s" does not exist\n' % parent)
1072     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1073         raise GitError('"%d" exists but is not a directory\n' % d)
1074     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1075                          preexec_fn = _gitenv)
1076     _git_wait('git init', p)
1077     # Force the index version configuration in order to ensure bup works
1078     # regardless of the version of the installed Git binary.
1079     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1080                          stdout=sys.stderr, preexec_fn = _gitenv)
1081     _git_wait('git config', p)
1082
1083
1084 def check_repo_or_die(path=None):
1085     """Make sure a bup repository exists, and abort if not.
1086     If the path to a particular repository was not specified, this function
1087     initializes the default repository automatically.
1088     """
1089     guess_repo(path)
1090     if not os.path.isdir(repo('objects/pack/.')):
1091         if repodir == home_repodir:
1092             init_repo()
1093         else:
1094             log('error: %r is not a bup/git repository\n' % repo())
1095             sys.exit(15)
1096
1097
1098 def treeparse(buf):
1099     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1100     ofs = 0
1101     while ofs < len(buf):
1102         z = buf[ofs:].find('\0')
1103         assert(z > 0)
1104         spl = buf[ofs:ofs+z].split(' ', 1)
1105         assert(len(spl) == 2)
1106         sha = buf[ofs+z+1:ofs+z+1+20]
1107         ofs += z+1+20
1108         yield (spl[0], spl[1], sha)
1109
1110
1111 _ver = None
1112 def ver():
1113     """Get Git's version and ensure a usable version is installed.
1114
1115     The returned version is formatted as an ordered tuple with each position
1116     representing a digit in the version tag. For example, the following tuple
1117     would represent version 1.6.6.9:
1118
1119         ('1', '6', '6', '9')
1120     """
1121     global _ver
1122     if not _ver:
1123         p = subprocess.Popen(['git', '--version'],
1124                              stdout=subprocess.PIPE)
1125         gvs = p.stdout.read()
1126         _git_wait('git --version', p)
1127         m = re.match(r'git version (\S+.\S+)', gvs)
1128         if not m:
1129             raise GitError('git --version weird output: %r' % gvs)
1130         _ver = tuple(m.group(1).split('.'))
1131     needed = ('1','5', '3', '1')
1132     if _ver < needed:
1133         raise GitError('git version %s or higher is required; you have %s'
1134                        % ('.'.join(needed), '.'.join(_ver)))
1135     return _ver
1136
1137
1138 def _git_wait(cmd, p):
1139     rv = p.wait()
1140     if rv != 0:
1141         raise GitError('%s returned %d' % (cmd, rv))
1142
1143
1144 def _git_capture(argv):
1145     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1146     r = p.stdout.read()
1147     _git_wait(repr(argv), p)
1148     return r
1149
1150
1151 class _AbortableIter:
1152     def __init__(self, it, onabort = None):
1153         self.it = it
1154         self.onabort = onabort
1155         self.done = None
1156
1157     def __iter__(self):
1158         return self
1159
1160     def next(self):
1161         try:
1162             return self.it.next()
1163         except StopIteration, e:
1164             self.done = True
1165             raise
1166         except:
1167             self.abort()
1168             raise
1169
1170     def abort(self):
1171         """Abort iteration and call the abortion callback, if needed."""
1172         if not self.done:
1173             self.done = True
1174             if self.onabort:
1175                 self.onabort()
1176
1177     def __del__(self):
1178         self.abort()
1179
1180
1181 _ver_warned = 0
1182 class CatPipe:
1183     """Link to 'git cat-file' that is used to retrieve blob data."""
1184     def __init__(self):
1185         global _ver_warned
1186         wanted = ('1','5','6')
1187         if ver() < wanted:
1188             if not _ver_warned:
1189                 log('warning: git version < %s; bup will be slow.\n'
1190                     % '.'.join(wanted))
1191                 _ver_warned = 1
1192             self.get = self._slow_get
1193         else:
1194             self.p = self.inprogress = None
1195             self.get = self._fast_get
1196
1197     def _abort(self):
1198         if self.p:
1199             self.p.stdout.close()
1200             self.p.stdin.close()
1201         self.p = None
1202         self.inprogress = None
1203
1204     def _restart(self):
1205         self._abort()
1206         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1207                                   stdin=subprocess.PIPE,
1208                                   stdout=subprocess.PIPE,
1209                                   close_fds = True,
1210                                   bufsize = 4096,
1211                                   preexec_fn = _gitenv)
1212
1213     def _fast_get(self, id):
1214         if not self.p or self.p.poll() != None:
1215             self._restart()
1216         assert(self.p)
1217         assert(self.p.poll() == None)
1218         if self.inprogress:
1219             log('_fast_get: opening %r while %r is open'
1220                 % (id, self.inprogress))
1221         assert(not self.inprogress)
1222         assert(id.find('\n') < 0)
1223         assert(id.find('\r') < 0)
1224         assert(not id.startswith('-'))
1225         self.inprogress = id
1226         self.p.stdin.write('%s\n' % id)
1227         self.p.stdin.flush()
1228         hdr = self.p.stdout.readline()
1229         if hdr.endswith(' missing\n'):
1230             self.inprogress = None
1231             raise KeyError('blob %r is missing' % id)
1232         spl = hdr.split(' ')
1233         if len(spl) != 3 or len(spl[0]) != 40:
1234             raise GitError('expected blob, got %r' % spl)
1235         (hex, type, size) = spl
1236
1237         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1238                            onabort = self._abort)
1239         try:
1240             yield type
1241             for blob in it:
1242                 yield blob
1243             assert(self.p.stdout.readline() == '\n')
1244             self.inprogress = None
1245         except Exception, e:
1246             it.abort()
1247             raise
1248
1249     def _slow_get(self, id):
1250         assert(id.find('\n') < 0)
1251         assert(id.find('\r') < 0)
1252         assert(id[0] != '-')
1253         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1254         yield type
1255
1256         p = subprocess.Popen(['git', 'cat-file', type, id],
1257                              stdout=subprocess.PIPE,
1258                              preexec_fn = _gitenv)
1259         for blob in chunkyreader(p.stdout):
1260             yield blob
1261         _git_wait('git cat-file', p)
1262
1263     def _join(self, it):
1264         type = it.next()
1265         if type == 'blob':
1266             for blob in it:
1267                 yield blob
1268         elif type == 'tree':
1269             treefile = ''.join(it)
1270             for (mode, name, sha) in treeparse(treefile):
1271                 for blob in self.join(sha.encode('hex')):
1272                     yield blob
1273         elif type == 'commit':
1274             treeline = ''.join(it).split('\n')[0]
1275             assert(treeline.startswith('tree '))
1276             for blob in self.join(treeline[5:]):
1277                 yield blob
1278         else:
1279             raise GitError('invalid object type %r: expected blob/tree/commit'
1280                            % type)
1281
1282     def join(self, id):
1283         """Generate a list of the content of all blobs that can be reached
1284         from an object.  The hash given in 'id' must point to a blob, a tree
1285         or a commit. The content of all blobs that can be seen from trees or
1286         commits will be added to the list.
1287         """
1288         try:
1289             for d in self._join(self.get(id)):
1290                 yield d
1291         except StopIteration:
1292             log('booger!\n')
1293
1294 def tags():
1295     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1296     tags = {}
1297     for (n,c) in list_refs():
1298         if n.startswith('refs/tags/'):
1299             name = n[10:]
1300             if not c in tags:
1301                 tags[c] = []
1302
1303             tags[c].append(name)  # more than one tag can point at 'c'
1304
1305     return tags