lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 4
  10
  11 """Discussion of bloom constants for bup:
  12
  13 There are four basic things to consider when building a bloom filter:
  14 The size, in bits, of the filter
  15 The capacity, in entries, of the filter
  16 The probability of a false positive that is tolerable
  17 The number of bits readily available to use for addresing filter bits
  18
  19 There is one major tunable that is not directly related to the above:
  20 k: the number of bits set in the filter per entry
  21
  22 Here's a wall of numbers showing the relationship between k; the ratio between
  23 the filter size in bits and the entries in the filter; and pfalse_positive:
  24
  25 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  26  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  27  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  51
  52 Here's a table showing available repository size for a given pfalse_positive
  53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  54 filter and 8192bytes per object):
  55
  56 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  60
  61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  62 addressing.
  63
  64 filter size scales linearly with repository size for a given k and pfalse.
  65
  66 Here's a table of filter sizes for a 1 TiB repository:
  67
  68 pfalse| k=3        | k=4        | k=5        | k=6
  69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  72
  73 For bup:
  74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  75 per lookup will be worse than the two required for midx.
  76 * We want the pfalse_positive to be low enough that the cost of sometimes
  77 faulting on the midx doesn't overcome the benefit of the bloom filter.
  78 * We have readily available 160 bits for addressing the filter.
  79 * We want to be able to have a single bloom address entire repositories of
  80 reasonable size.
  81
  82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  83 that bup needs.  As such, I've implemented bloom addressing, adding and
  84 checking functions in C for these two values.  Because k=5 requires less space
  85 and gives better overall pfalse_positive perofrmance, it is preferred if a
  86 table with k=5 can represent the repository.
  87
  88 None of this tells us what max_pfalse_positive to choose.
  89
  90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  91 """
  92 BLOOM_VERSION = 2
  93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  96
  97 verbose = 0
  98 ignore_midx = 0
  99 home_repodir = os.path.expanduser('~/.bup')
 100 repodir = None
 101
 102 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 104
 105 _total_searches = 0
 106 _total_steps = 0
 107
 108
 109 class GitError(Exception):
 110     pass
 111
 112
 113 def repo(sub = ''):
 114     """Get the path to the git repository or one of its subdirectories."""
 115     global repodir
 116     if not repodir:
 117         raise GitError('You should call check_repo_or_die()')
 118
 119     # If there's a .git subdirectory, then the actual repo is in there.
 120     gd = os.path.join(repodir, '.git')
 121     if os.path.exists(gd):
 122         repodir = gd
 123
 124     return os.path.join(repodir, sub)
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from semgmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be assembled
 172
 173     For more information on the name mangling algorythm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def _encode_packobj(type, content):
 184     szout = ''
 185     sz = len(content)
 186     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 187     sz >>= 4
 188     while 1:
 189         if sz: szbits |= 0x80
 190         szout += chr(szbits)
 191         if not sz:
 192             break
 193         szbits = sz & 0x7f
 194         sz >>= 7
 195     z = zlib.compressobj(1)
 196     yield szout
 197     yield z.compress(content)
 198     yield z.flush()
 199
 200
 201 def _encode_looseobj(type, content):
 202     z = zlib.compressobj(1)
 203     yield z.compress('%s %d\0' % (type, len(content)))
 204     yield z.compress(content)
 205     yield z.flush()
 206
 207
 208 def _decode_looseobj(buf):
 209     assert(buf);
 210     s = zlib.decompress(buf)
 211     i = s.find('\0')
 212     assert(i > 0)
 213     l = s[:i].split(' ')
 214     type = l[0]
 215     sz = int(l[1])
 216     content = s[i+1:]
 217     assert(type in _typemap)
 218     assert(sz == len(content))
 219     return (type, content)
 220
 221
 222 def _decode_packobj(buf):
 223     assert(buf)
 224     c = ord(buf[0])
 225     type = _typermap[(c & 0x70) >> 4]
 226     sz = c & 0x0f
 227     shift = 4
 228     i = 0
 229     while c & 0x80:
 230         i += 1
 231         c = ord(buf[i])
 232         sz |= (c & 0x7f) << shift
 233         shift += 7
 234         if not (c & 0x80):
 235             break
 236     return (type, zlib.decompress(buf[i+1:]))
 237
 238
 239 class PackIdx:
 240     def __init__(self):
 241         assert(0)
 242
 243     def find_offset(self, hash):
 244         """Get the offset of an object inside the index file."""
 245         idx = self._idx_from_hash(hash)
 246         if idx != None:
 247             return self._ofs_from_idx(idx)
 248         return None
 249
 250     def exists(self, hash, want_source=False):
 251         """Return nonempty if the object exists in this index."""
 252         if hash and (self._idx_from_hash(hash) != None):
 253             return want_source and self.name or True
 254         return None
 255
 256     def __len__(self):
 257         return int(self.fanout[255])
 258
 259     def _idx_from_hash(self, hash):
 260         global _total_searches, _total_steps
 261         _total_searches += 1
 262         assert(len(hash) == 20)
 263         b1 = ord(hash[0])
 264         start = self.fanout[b1-1] # range -1..254
 265         end = self.fanout[b1] # range 0..255
 266         want = str(hash)
 267         _total_steps += 1  # lookup table is a step
 268         while start < end:
 269             _total_steps += 1
 270             mid = start + (end-start)/2
 271             v = self._idx_to_hash(mid)
 272             if v < want:
 273                 start = mid+1
 274             elif v > want:
 275                 end = mid
 276             else: # got it!
 277                 return mid
 278         return None
 279
 280
 281 class PackIdxV1(PackIdx):
 282     """Object representation of a Git pack index (version 1) file."""
 283     def __init__(self, filename, f):
 284         self.name = filename
 285         self.idxnames = [self.name]
 286         self.map = mmap_read(f)
 287         self.fanout = list(struct.unpack('!256I',
 288                                          str(buffer(self.map, 0, 256*4))))
 289         self.fanout.append(0)  # entry "-1"
 290         nsha = self.fanout[255]
 291         self.sha_ofs = 256*4
 292         self.shatable = buffer(self.map, self.sha_ofs, nsha*24)
 293
 294     def _ofs_from_idx(self, idx):
 295         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 296
 297     def _idx_to_hash(self, idx):
 298         return str(self.shatable[idx*24+4 : idx*24+24])
 299
 300     def __iter__(self):
 301         for i in xrange(self.fanout[255]):
 302             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 303
 304
 305 class PackIdxV2(PackIdx):
 306     """Object representation of a Git pack index (version 2) file."""
 307     def __init__(self, filename, f):
 308         self.name = filename
 309         self.idxnames = [self.name]
 310         self.map = mmap_read(f)
 311         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 312         self.fanout = list(struct.unpack('!256I',
 313                                          str(buffer(self.map, 8, 256*4))))
 314         self.fanout.append(0)  # entry "-1"
 315         nsha = self.fanout[255]
 316         self.sha_ofs = 8 + 256*4
 317         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 318         self.ofstable = buffer(self.map,
 319                                self.sha_ofs + nsha*20 + nsha*4,
 320                                nsha*4)
 321         self.ofs64table = buffer(self.map,
 322                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 323
 324     def _ofs_from_idx(self, idx):
 325         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 326         if ofs & 0x80000000:
 327             idx64 = ofs & 0x7fffffff
 328             ofs = struct.unpack('!Q',
 329                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 330         return ofs
 331
 332     def _idx_to_hash(self, idx):
 333         return str(self.shatable[idx*20:(idx+1)*20])
 334
 335     def __iter__(self):
 336         for i in xrange(self.fanout[255]):
 337             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 338
 339
 340 extract_bits = _helpers.extract_bits
 341
 342 bloom_contains = _helpers.bloom_contains
 343 bloom_add = _helpers.bloom_add
 344
 345
 346 class ShaBloom:
 347     """Wrapper which contains data from multiple index files.
 348     """
 349     def __init__(self, filename, f=None, readwrite=False):
 350         self.name = filename
 351         self.rwfile = None
 352         self.map = None
 353         assert(filename.endswith('.bloom'))
 354         if readwrite:
 355             self.rwfile = f or open(filename, 'r+b')
 356             self.map = mmap_readwrite(self.rwfile, close=False)
 357         else:
 358             self.rwfile = None
 359             self.map = mmap_read(f or open(filename, 'rb'))
 360         if str(self.map[0:4]) != 'BLOM':
 361             log('Warning: skipping: invalid BLOM header in %r\n' % filename)
 362             return self._init_failed()
 363         ver = struct.unpack('!I', self.map[4:8])[0]
 364         if ver < BLOOM_VERSION:
 365             log('Warning: ignoring old-style (v%d) bloom %r\n'
 366                 % (ver, filename))
 367             return self._init_failed()
 368         if ver > BLOOM_VERSION:
 369             log('Warning: ignoring too-new (v%d) bloom %r\n'
 370                 % (ver, filename))
 371             return self._init_failed()
 372
 373         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 374         idxnamestr = str(self.map[16 + 2**self.bits:])
 375         if idxnamestr:
 376             self.idxnames = idxnamestr.split('\0')
 377         else:
 378             self.idxnames = []
 379
 380     def _init_failed(self):
 381         if self.map:
 382             self.map.close()
 383             self.map = None
 384         if self.rwfile:
 385             self.rwfile.close()
 386             self.rwfile = None
 387         self.idxnames = []
 388         self.bits = self.entries = 0
 389
 390     def valid(self):
 391         return self.map and self.bits
 392
 393     def __del__(self):
 394         self.close()
 395
 396     def close(self):
 397         if self.map:
 398             if self.rwfile:
 399                 debug2("bloom: closing with %d entries\n" % self.entries)
 400                 self.map[12:16] = struct.pack('!I', self.entries)
 401                 self.map.flush()
 402         if self.rwfile:
 403             self.rwfile.seek(16 + 2**self.bits)
 404             if self.idxnames:
 405                 self.rwfile.write('\0'.join(self.idxnames))
 406         self._init_failed()
 407
 408     def pfalse_positive(self, additional=0):
 409         n = self.entries + additional
 410         m = 8*2**self.bits
 411         k = self.k
 412         return 100*(1-math.exp(-k*float(n)/m))**k
 413
 414     def add_idx(self, ix):
 415         """Add the object to the filter, return current pfalse_positive."""
 416         if not self.map: raise Exception, "Cannot add to closed bloom"
 417         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 418         self.idxnames.append(os.path.basename(ix.name))
 419
 420     def exists(self, sha):
 421         """Return nonempty if the object probably exists in the bloom filter."""
 422         global _total_searches, _total_steps
 423         _total_searches += 1
 424         if not self.map: return None
 425         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 426         _total_steps += steps
 427         return found
 428
 429     @classmethod
 430     def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
 431         """Create and return a bloom filter for `expected` entries."""
 432         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 433         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 434         if bits > MAX_BLOOM_BITS[k]:
 435             log('bloom: warning, max bits exceeded, non-optimal\n')
 436             bits = MAX_BLOOM_BITS[k]
 437         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 438         f = f or open(name, 'w+b')
 439         f.write('BLOM')
 440         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 441         assert(f.tell() == 16)
 442         # NOTE: On some systems this will not extend+zerofill, but it does on
 443         # darwin, linux, bsd and solaris.
 444         f.truncate(16+2**bits)
 445         f.seek(0)
 446         return cls(name, f=f, readwrite=readwrite)
 447
 448     def __len__(self):
 449         return self.entries
 450
 451
 452 class PackMidx:
 453     """Wrapper which contains data from multiple index files.
 454     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 455     and make it possible for bup to expand Git's indexing capabilities to vast
 456     amounts of files.
 457     """
 458     def __init__(self, filename):
 459         self.name = filename
 460         self.force_keep = False
 461         assert(filename.endswith('.midx'))
 462         self.map = mmap_read(open(filename))
 463         if str(self.map[0:4]) != 'MIDX':
 464             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 465             self.force_keep = True
 466             return self._init_failed()
 467         ver = struct.unpack('!I', self.map[4:8])[0]
 468         if ver < MIDX_VERSION:
 469             log('Warning: ignoring old-style (v%d) midx %r\n'
 470                 % (ver, filename))
 471             self.force_keep = False  # old stuff is boring
 472             return self._init_failed()
 473         if ver > MIDX_VERSION:
 474             log('Warning: ignoring too-new (v%d) midx %r\n'
 475                 % (ver, filename))
 476             self.force_keep = True  # new stuff is exciting
 477             return self._init_failed()
 478
 479         self.bits = _helpers.firstword(self.map[8:12])
 480         self.entries = 2**self.bits
 481         self.fanout = buffer(self.map, 12, self.entries*4)
 482         self.sha_ofs = 12 + self.entries*4
 483         self.nsha = nsha = self._fanget(self.entries-1)
 484         self.shatable = buffer(self.map, self.sha_ofs, nsha*20)
 485         self.whichlist = buffer(self.map, self.sha_ofs + nsha*20, nsha*4)
 486         self.idxname_ofs = self.sha_ofs + 24*nsha
 487         self.idxnames = str(self.map[self.idxname_ofs:]).split('\0')
 488
 489     def _init_failed(self):
 490         self.bits = 0
 491         self.entries = 1
 492         self.fanout = buffer('\0\0\0\0')
 493         self.shatable = buffer('\0'*20)
 494         self.idxnames = []
 495
 496     def _fanget(self, i):
 497         start = i*4
 498         s = self.fanout[start:start+4]
 499         return _helpers.firstword(s)
 500
 501     def _get(self, i):
 502         return str(self.shatable[i*20:(i+1)*20])
 503
 504     def _get_idx_i(self, i):
 505         return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
 506
 507     def _get_idxname(self, i):
 508         return self.idxnames[self._get_idx_i(i)]
 509
 510     def exists(self, hash, want_source=False):
 511         """Return nonempty if the object exists in the index files."""
 512         global _total_searches, _total_steps
 513         _total_searches += 1
 514         want = str(hash)
 515         el = extract_bits(want, self.bits)
 516         if el:
 517             start = self._fanget(el-1)
 518             startv = el << (32-self.bits)
 519         else:
 520             start = 0
 521             startv = 0
 522         end = self._fanget(el)
 523         endv = (el+1) << (32-self.bits)
 524         _total_steps += 1   # lookup table is a step
 525         hashv = _helpers.firstword(hash)
 526         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 527         while start < end:
 528             _total_steps += 1
 529             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 530             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 531             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 532             v = self._get(mid)
 533             #print '    %08x' % self._num(v)
 534             if v < want:
 535                 start = mid+1
 536                 startv = _helpers.firstword(v)
 537             elif v > want:
 538                 end = mid
 539                 endv = _helpers.firstword(v)
 540             else: # got it!
 541                 return want_source and self._get_idxname(mid) or True
 542         return None
 543
 544     def __iter__(self):
 545         for i in xrange(self._fanget(self.entries-1)):
 546             yield buffer(self.shatable, i*20, 20)
 547
 548     def __len__(self):
 549         return int(self._fanget(self.entries-1))
 550
 551
 552 _mpi_count = 0
 553 class PackIdxList:
 554     def __init__(self, dir):
 555         global _mpi_count
 556         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 557         _mpi_count += 1
 558         self.dir = dir
 559         self.also = set()
 560         self.packs = []
 561         self.do_bloom = False
 562         self.bloom = None
 563         self.refresh()
 564
 565     def __del__(self):
 566         global _mpi_count
 567         _mpi_count -= 1
 568         assert(_mpi_count == 0)
 569
 570     def __iter__(self):
 571         return iter(idxmerge(self.packs))
 572
 573     def __len__(self):
 574         return sum(len(pack) for pack in self.packs)
 575
 576     def exists(self, hash, want_source=False):
 577         """Return nonempty if the object exists in the index files."""
 578         global _total_searches
 579         _total_searches += 1
 580         if hash in self.also:
 581             return True
 582         if self.do_bloom and self.bloom is not None:
 583             _total_searches -= 1  # will be incremented by bloom
 584             if self.bloom.exists(hash):
 585                 self.do_bloom = False
 586             else:
 587                 return None
 588         for i in xrange(len(self.packs)):
 589             p = self.packs[i]
 590             _total_searches -= 1  # will be incremented by sub-pack
 591             ix = p.exists(hash, want_source=want_source)
 592             if ix:
 593                 # reorder so most recently used packs are searched first
 594                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 595                 return ix
 596         self.do_bloom = True
 597         return None
 598
 599     def refresh(self, skip_midx = False):
 600         """Refresh the index list.
 601         This method verifies if .midx files were superseded (e.g. all of its
 602         contents are in another, bigger .midx file) and removes the superseded
 603         files.
 604
 605         If skip_midx is True, all work on .midx files will be skipped and .midx
 606         files will be removed from the list.
 607
 608         The module-global variable 'ignore_midx' can force this function to
 609         always act as if skip_midx was True.
 610         """
 611         self.bloom = None # Always reopen the bloom as it may have been relaced
 612         self.do_bloom = False
 613         skip_midx = skip_midx or ignore_midx
 614         d = dict((p.name, p) for p in self.packs
 615                  if not skip_midx or not isinstance(p, PackMidx))
 616         if os.path.exists(self.dir):
 617             if not skip_midx:
 618                 midxl = []
 619                 for ix in self.packs:
 620                     if isinstance(ix, PackMidx):
 621                         for name in ix.idxnames:
 622                             d[os.path.join(self.dir, name)] = ix
 623                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 624                     if not d.get(full):
 625                         mx = PackMidx(full)
 626                         (mxd, mxf) = os.path.split(mx.name)
 627                         broken = False
 628                         for n in mx.idxnames:
 629                             if not os.path.exists(os.path.join(mxd, n)):
 630                                 log(('warning: index %s missing\n' +
 631                                     '  used by %s\n') % (n, mxf))
 632                                 broken = True
 633                         if broken:
 634                             del mx
 635                             unlink(full)
 636                         else:
 637                             midxl.append(mx)
 638                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 639                 for ix in midxl:
 640                     any_needed = False
 641                     for sub in ix.idxnames:
 642                         found = d.get(os.path.join(self.dir, sub))
 643                         if not found or isinstance(found, PackIdx):
 644                             # doesn't exist, or exists but not in a midx
 645                             any_needed = True
 646                             break
 647                     if any_needed:
 648                         d[ix.name] = ix
 649                         for name in ix.idxnames:
 650                             d[os.path.join(self.dir, name)] = ix
 651                     elif not ix.force_keep:
 652                         debug1('midx: removing redundant: %s\n'
 653                                % os.path.basename(ix.name))
 654                         unlink(ix.name)
 655             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 656                 if not d.get(full):
 657                     try:
 658                         ix = open_idx(full)
 659                     except GitError, e:
 660                         add_error(e)
 661                         continue
 662                     d[full] = ix
 663             bfull = os.path.join(self.dir, 'bup.bloom')
 664             if self.bloom is None and os.path.exists(bfull):
 665                 self.bloom = ShaBloom(bfull)
 666             self.packs = list(set(d.values()))
 667             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 668             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 669                 self.do_bloom = True
 670             else:
 671                 self.bloom = None
 672         debug1('PackIdxList: using %d index%s.\n'
 673             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 674
 675     def add(self, hash):
 676         """Insert an additional object in the list."""
 677         self.also.add(hash)
 678
 679
 680 def calc_hash(type, content):
 681     """Calculate some content's hash in the Git fashion."""
 682     header = '%s %d\0' % (type, len(content))
 683     sum = Sha1(header)
 684     sum.update(content)
 685     return sum.digest()
 686
 687
 688 def _shalist_sort_key(ent):
 689     (mode, name, id) = ent
 690     if stat.S_ISDIR(int(mode, 8)):
 691         return name + '/'
 692     else:
 693         return name
 694
 695
 696 def open_idx(filename):
 697     if filename.endswith('.idx'):
 698         f = open(filename, 'rb')
 699         header = f.read(8)
 700         if header[0:4] == '\377tOc':
 701             version = struct.unpack('!I', header[4:8])[0]
 702             if version == 2:
 703                 return PackIdxV2(filename, f)
 704             else:
 705                 raise GitError('%s: expected idx file version 2, got %d'
 706                                % (filename, version))
 707         elif len(header) == 8 and header[0:4] < '\377tOc':
 708             return PackIdxV1(filename, f)
 709         else:
 710             raise GitError('%s: unrecognized idx file header' % filename)
 711     elif filename.endswith('.midx'):
 712         return PackMidx(filename)
 713     else:
 714         raise GitError('idx filenames must end with .idx or .midx')
 715
 716
 717 def idxmerge(idxlist, final_progress=True):
 718     """Generate a list of all the objects reachable in a PackIdxList."""
 719     def pfunc(count, total):
 720         progress('Reading indexes: %.2f%% (%d/%d)\r'
 721                  % (count*100.0/total, count, total))
 722     def pfinal(count, total):
 723         if final_progress:
 724             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 725     return merge_iter(idxlist, 10024, pfunc, pfinal)
 726
 727
 728 def _make_objcache():
 729     return PackIdxList(repo('objects/pack'))
 730
 731 class PackWriter:
 732     """Writes Git objects insid a pack file."""
 733     def __init__(self, objcache_maker=_make_objcache):
 734         self.count = 0
 735         self.outbytes = 0
 736         self.filename = None
 737         self.file = None
 738         self.idx = None
 739         self.objcache_maker = objcache_maker
 740         self.objcache = None
 741
 742     def __del__(self):
 743         self.close()
 744
 745     def _open(self):
 746         if not self.file:
 747             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 748             self.file = os.fdopen(fd, 'w+b')
 749             assert(name.endswith('.pack'))
 750             self.filename = name[:-5]
 751             self.file.write('PACK\0\0\0\2\0\0\0\0')
 752             self.idx = list(list() for i in xrange(256))
 753
 754     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 755     # in this basic version.
 756     def _raw_write(self, datalist, sha):
 757         self._open()
 758         f = self.file
 759         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 760         # the file never has a *partial* blob.  So let's make sure it's
 761         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 762         # to our hashsplit algorithm.)  f.write() does its own buffering,
 763         # but that's okay because we'll flush it in _end().
 764         oneblob = ''.join(datalist)
 765         try:
 766             f.write(oneblob)
 767         except IOError, e:
 768             raise GitError, e, sys.exc_info()[2]
 769         nw = len(oneblob)
 770         crc = zlib.crc32(oneblob) & 0xffffffff
 771         self._update_idx(sha, crc, nw)
 772         self.outbytes += nw
 773         self.count += 1
 774         return nw, crc
 775
 776     def _update_idx(self, sha, crc, size):
 777         assert(sha)
 778         if self.idx:
 779             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 780
 781     def _write(self, sha, type, content):
 782         if verbose:
 783             log('>')
 784         if not sha:
 785             sha = calc_hash(type, content)
 786         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 787         return sha
 788
 789     def breakpoint(self):
 790         """Clear byte and object counts and return the last processed id."""
 791         id = self._end()
 792         self.outbytes = self.count = 0
 793         return id
 794
 795     def _require_objcache(self):
 796         if self.objcache is None and self.objcache_maker:
 797             self.objcache = self.objcache_maker()
 798         if self.objcache is None:
 799             raise GitError(
 800                     "PackWriter not opened or can't check exists w/o objcache")
 801
 802     def exists(self, id, want_source=False):
 803         """Return non-empty if an object is found in the object cache."""
 804         self._require_objcache()
 805         return self.objcache.exists(id, want_source=want_source)
 806
 807     def maybe_write(self, type, content):
 808         """Write an object to the pack file if not present and return its id."""
 809         self._require_objcache()
 810         sha = calc_hash(type, content)
 811         if not self.exists(sha):
 812             self._write(sha, type, content)
 813             self.objcache.add(sha)
 814         return sha
 815
 816     def new_blob(self, blob):
 817         """Create a blob object in the pack with the supplied content."""
 818         return self.maybe_write('blob', blob)
 819
 820     def new_tree(self, shalist):
 821         """Create a tree object in the pack."""
 822         shalist = sorted(shalist, key = _shalist_sort_key)
 823         l = []
 824         for (mode,name,bin) in shalist:
 825             assert(mode)
 826             assert(mode != '0')
 827             assert(mode[0] != '0')
 828             assert(name)
 829             assert(len(bin) == 20)
 830             l.append('%s %s\0%s' % (mode,name,bin))
 831         return self.maybe_write('tree', ''.join(l))
 832
 833     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 834         l = []
 835         if tree: l.append('tree %s' % tree.encode('hex'))
 836         if parent: l.append('parent %s' % parent.encode('hex'))
 837         if author: l.append('author %s %s' % (author, _git_date(adate)))
 838         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 839         l.append('')
 840         l.append(msg)
 841         return self.maybe_write('commit', '\n'.join(l))
 842
 843     def new_commit(self, parent, tree, date, msg):
 844         """Create a commit object in the pack."""
 845         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 846         commit = self._new_commit(tree, parent,
 847                                   userline, date, userline, date,
 848                                   msg)
 849         return commit
 850
 851     def abort(self):
 852         """Remove the pack file from disk."""
 853         f = self.file
 854         if f:
 855             self.idx = None
 856             self.file = None
 857             f.close()
 858             os.unlink(self.filename + '.pack')
 859
 860     def _end(self, run_midx=True):
 861         f = self.file
 862         if not f: return None
 863         self.file = None
 864         self.objcache = None
 865         idx = self.idx
 866         self.idx = None
 867
 868         # update object count
 869         f.seek(8)
 870         cp = struct.pack('!i', self.count)
 871         assert(len(cp) == 4)
 872         f.write(cp)
 873
 874         # calculate the pack sha1sum
 875         f.seek(0)
 876         sum = Sha1()
 877         for b in chunkyreader(f):
 878             sum.update(b)
 879         packbin = sum.digest()
 880         f.write(packbin)
 881         f.close()
 882
 883         idx_f = open(self.filename + '.idx', 'wb')
 884         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 885         idx_f.close()
 886
 887         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 888         if os.path.exists(self.filename + '.map'):
 889             os.unlink(self.filename + '.map')
 890         os.rename(self.filename + '.pack', nameprefix + '.pack')
 891         os.rename(self.filename + '.idx', nameprefix + '.idx')
 892
 893         if run_midx:
 894             auto_midx(repo('objects/pack'))
 895         return nameprefix
 896
 897     def close(self, run_midx=True):
 898         """Close the pack file and move it to its definitive path."""
 899         return self._end(run_midx=run_midx)
 900
 901     def _write_pack_idx_v2(self, file, idx, packbin):
 902         sum = Sha1()
 903
 904         def write(data):
 905             file.write(data)
 906             sum.update(data)
 907
 908         write('\377tOc\0\0\0\2')
 909
 910         n = 0
 911         for part in idx:
 912             n += len(part)
 913             write(struct.pack('!i', n))
 914             part.sort(key=lambda x: x[0])
 915
 916         obj_list_sum = Sha1()
 917         for part in idx:
 918             for entry in part:
 919                 write(entry[0])
 920                 obj_list_sum.update(entry[0])
 921         for part in idx:
 922             for entry in part:
 923                 write(struct.pack('!I', entry[1]))
 924         ofs64_list = []
 925         for part in idx:
 926             for entry in part:
 927                 if entry[2] & 0x80000000:
 928                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 929                     ofs64_list.append(struct.pack('!Q', entry[2]))
 930                 else:
 931                     write(struct.pack('!i', entry[2]))
 932         for ofs64 in ofs64_list:
 933             write(ofs64)
 934
 935         write(packbin)
 936         file.write(sum.digest())
 937         return obj_list_sum.hexdigest()
 938
 939
 940 def _git_date(date):
 941     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 942
 943
 944 def _gitenv():
 945     os.environ['GIT_DIR'] = os.path.abspath(repo())
 946
 947
 948 def list_refs(refname = None):
 949     """Generate a list of tuples in the form (refname,hash).
 950     If a ref name is specified, list only this particular ref.
 951     """
 952     argv = ['git', 'show-ref', '--']
 953     if refname:
 954         argv += [refname]
 955     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 956     out = p.stdout.read().strip()
 957     rv = p.wait()  # not fatal
 958     if rv:
 959         assert(not out)
 960     if out:
 961         for d in out.split('\n'):
 962             (sha, name) = d.split(' ', 1)
 963             yield (name, sha.decode('hex'))
 964
 965
 966 def read_ref(refname):
 967     """Get the commit id of the most recent commit made on a given ref."""
 968     l = list(list_refs(refname))
 969     if l:
 970         assert(len(l) == 1)
 971         return l[0][1]
 972     else:
 973         return None
 974
 975
 976 def rev_list(ref, count=None):
 977     """Generate a list of reachable commits in reverse chronological order.
 978
 979     This generator walks through commits, from child to parent, that are
 980     reachable via the specified ref and yields a series of tuples of the form
 981     (date,hash).
 982
 983     If count is a non-zero integer, limit the number of commits to "count"
 984     objects.
 985     """
 986     assert(not ref.startswith('-'))
 987     opts = []
 988     if count:
 989         opts += ['-n', str(atoi(count))]
 990     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 991     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 992     commit = None
 993     for row in p.stdout:
 994         s = row.strip()
 995         if s.startswith('commit '):
 996             commit = s[7:].decode('hex')
 997         else:
 998             date = int(s)
 999             yield (date, commit)
1000     rv = p.wait()  # not fatal
1001     if rv:
1002         raise GitError, 'git rev-list returned error %d' % rv
1003
1004
1005 def rev_get_date(ref):
1006     """Get the date of the latest commit on the specified ref."""
1007     for (date, commit) in rev_list(ref, count=1):
1008         return date
1009     raise GitError, 'no such commit %r' % ref
1010
1011
1012 def rev_parse(committish):
1013     """Resolve the full hash for 'committish', if it exists.
1014
1015     Should be roughly equivalent to 'git rev-parse'.
1016
1017     Returns the hex value of the hash if it is found, None if 'committish' does
1018     not correspond to anything.
1019     """
1020     head = read_ref(committish)
1021     if head:
1022         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1023         return head
1024
1025     pL = PackIdxList(repo('objects/pack'))
1026
1027     if len(committish) == 40:
1028         try:
1029             hash = committish.decode('hex')
1030         except TypeError:
1031             return None
1032
1033         if pL.exists(hash):
1034             return hash
1035
1036     return None
1037
1038
1039 def update_ref(refname, newval, oldval):
1040     """Change the commit pointed to by a branch."""
1041     if not oldval:
1042         oldval = ''
1043     assert(refname.startswith('refs/heads/'))
1044     p = subprocess.Popen(['git', 'update-ref', refname,
1045                           newval.encode('hex'), oldval.encode('hex')],
1046                          preexec_fn = _gitenv)
1047     _git_wait('git update-ref', p)
1048
1049
1050 def guess_repo(path=None):
1051     """Set the path value in the global variable "repodir".
1052     This makes bup look for an existing bup repository, but not fail if a
1053     repository doesn't exist. Usually, if you are interacting with a bup
1054     repository, you would not be calling this function but using
1055     check_repo_or_die().
1056     """
1057     global repodir
1058     if path:
1059         repodir = path
1060     if not repodir:
1061         repodir = os.environ.get('BUP_DIR')
1062         if not repodir:
1063             repodir = os.path.expanduser('~/.bup')
1064
1065
1066 def init_repo(path=None):
1067     """Create the Git bare repository for bup in a given path."""
1068     guess_repo(path)
1069     d = repo()  # appends a / to the path
1070     parent = os.path.dirname(os.path.dirname(d))
1071     if parent and not os.path.exists(parent):
1072         raise GitError('parent directory "%s" does not exist\n' % parent)
1073     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1074         raise GitError('"%d" exists but is not a directory\n' % d)
1075     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1076                          preexec_fn = _gitenv)
1077     _git_wait('git init', p)
1078     # Force the index version configuration in order to ensure bup works
1079     # regardless of the version of the installed Git binary.
1080     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1081                          stdout=sys.stderr, preexec_fn = _gitenv)
1082     _git_wait('git config', p)
1083
1084
1085 def check_repo_or_die(path=None):
1086     """Make sure a bup repository exists, and abort if not.
1087     If the path to a particular repository was not specified, this function
1088     initializes the default repository automatically.
1089     """
1090     guess_repo(path)
1091     if not os.path.isdir(repo('objects/pack/.')):
1092         if repodir == home_repodir:
1093             init_repo()
1094         else:
1095             log('error: %r is not a bup/git repository\n' % repo())
1096             sys.exit(15)
1097
1098
1099 def treeparse(buf):
1100     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1101     ofs = 0
1102     while ofs < len(buf):
1103         z = buf[ofs:].find('\0')
1104         assert(z > 0)
1105         spl = buf[ofs:ofs+z].split(' ', 1)
1106         assert(len(spl) == 2)
1107         sha = buf[ofs+z+1:ofs+z+1+20]
1108         ofs += z+1+20
1109         yield (spl[0], spl[1], sha)
1110
1111
1112 _ver = None
1113 def ver():
1114     """Get Git's version and ensure a usable version is installed.
1115
1116     The returned version is formatted as an ordered tuple with each position
1117     representing a digit in the version tag. For example, the following tuple
1118     would represent version 1.6.6.9:
1119
1120         ('1', '6', '6', '9')
1121     """
1122     global _ver
1123     if not _ver:
1124         p = subprocess.Popen(['git', '--version'],
1125                              stdout=subprocess.PIPE)
1126         gvs = p.stdout.read()
1127         _git_wait('git --version', p)
1128         m = re.match(r'git version (\S+.\S+)', gvs)
1129         if not m:
1130             raise GitError('git --version weird output: %r' % gvs)
1131         _ver = tuple(m.group(1).split('.'))
1132     needed = ('1','5', '3', '1')
1133     if _ver < needed:
1134         raise GitError('git version %s or higher is required; you have %s'
1135                        % ('.'.join(needed), '.'.join(_ver)))
1136     return _ver
1137
1138
1139 def _git_wait(cmd, p):
1140     rv = p.wait()
1141     if rv != 0:
1142         raise GitError('%s returned %d' % (cmd, rv))
1143
1144
1145 def _git_capture(argv):
1146     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1147     r = p.stdout.read()
1148     _git_wait(repr(argv), p)
1149     return r
1150
1151
1152 class _AbortableIter:
1153     def __init__(self, it, onabort = None):
1154         self.it = it
1155         self.onabort = onabort
1156         self.done = None
1157
1158     def __iter__(self):
1159         return self
1160
1161     def next(self):
1162         try:
1163             return self.it.next()
1164         except StopIteration, e:
1165             self.done = True
1166             raise
1167         except:
1168             self.abort()
1169             raise
1170
1171     def abort(self):
1172         """Abort iteration and call the abortion callback, if needed."""
1173         if not self.done:
1174             self.done = True
1175             if self.onabort:
1176                 self.onabort()
1177
1178     def __del__(self):
1179         self.abort()
1180
1181
1182 _ver_warned = 0
1183 class CatPipe:
1184     """Link to 'git cat-file' that is used to retrieve blob data."""
1185     def __init__(self):
1186         global _ver_warned
1187         wanted = ('1','5','6')
1188         if ver() < wanted:
1189             if not _ver_warned:
1190                 log('warning: git version < %s; bup will be slow.\n'
1191                     % '.'.join(wanted))
1192                 _ver_warned = 1
1193             self.get = self._slow_get
1194         else:
1195             self.p = self.inprogress = None
1196             self.get = self._fast_get
1197
1198     def _abort(self):
1199         if self.p:
1200             self.p.stdout.close()
1201             self.p.stdin.close()
1202         self.p = None
1203         self.inprogress = None
1204
1205     def _restart(self):
1206         self._abort()
1207         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1208                                   stdin=subprocess.PIPE,
1209                                   stdout=subprocess.PIPE,
1210                                   close_fds = True,
1211                                   bufsize = 4096,
1212                                   preexec_fn = _gitenv)
1213
1214     def _fast_get(self, id):
1215         if not self.p or self.p.poll() != None:
1216             self._restart()
1217         assert(self.p)
1218         assert(self.p.poll() == None)
1219         if self.inprogress:
1220             log('_fast_get: opening %r while %r is open'
1221                 % (id, self.inprogress))
1222         assert(not self.inprogress)
1223         assert(id.find('\n') < 0)
1224         assert(id.find('\r') < 0)
1225         assert(not id.startswith('-'))
1226         self.inprogress = id
1227         self.p.stdin.write('%s\n' % id)
1228         self.p.stdin.flush()
1229         hdr = self.p.stdout.readline()
1230         if hdr.endswith(' missing\n'):
1231             self.inprogress = None
1232             raise KeyError('blob %r is missing' % id)
1233         spl = hdr.split(' ')
1234         if len(spl) != 3 or len(spl[0]) != 40:
1235             raise GitError('expected blob, got %r' % spl)
1236         (hex, type, size) = spl
1237
1238         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1239                            onabort = self._abort)
1240         try:
1241             yield type
1242             for blob in it:
1243                 yield blob
1244             assert(self.p.stdout.readline() == '\n')
1245             self.inprogress = None
1246         except Exception, e:
1247             it.abort()
1248             raise
1249
1250     def _slow_get(self, id):
1251         assert(id.find('\n') < 0)
1252         assert(id.find('\r') < 0)
1253         assert(id[0] != '-')
1254         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1255         yield type
1256
1257         p = subprocess.Popen(['git', 'cat-file', type, id],
1258                              stdout=subprocess.PIPE,
1259                              preexec_fn = _gitenv)
1260         for blob in chunkyreader(p.stdout):
1261             yield blob
1262         _git_wait('git cat-file', p)
1263
1264     def _join(self, it):
1265         type = it.next()
1266         if type == 'blob':
1267             for blob in it:
1268                 yield blob
1269         elif type == 'tree':
1270             treefile = ''.join(it)
1271             for (mode, name, sha) in treeparse(treefile):
1272                 for blob in self.join(sha.encode('hex')):
1273                     yield blob
1274         elif type == 'commit':
1275             treeline = ''.join(it).split('\n')[0]
1276             assert(treeline.startswith('tree '))
1277             for blob in self.join(treeline[5:]):
1278                 yield blob
1279         else:
1280             raise GitError('invalid object type %r: expected blob/tree/commit'
1281                            % type)
1282
1283     def join(self, id):
1284         """Generate a list of the content of all blobs that can be reached
1285         from an object.  The hash given in 'id' must point to a blob, a tree
1286         or a commit. The content of all blobs that can be seen from trees or
1287         commits will be added to the list.
1288         """
1289         try:
1290             for d in self._join(self.get(id)):
1291                 yield d
1292         except StopIteration:
1293             log('booger!\n')
1294
1295 def tags():
1296     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1297     tags = {}
1298     for (n,c) in list_refs():
1299         if n.startswith('refs/tags/'):
1300             name = n[10:]
1301             if not c in tags:
1302                 tags[c] = []
1303
1304             tags[c].append(name)  # more than one tag can point at 'c'
1305
1306     return tags