lib/bup/git.py

   1 """Git interaction library.
   2 bup repositories are in Git format. This library allows us to
   3 interact with the Git data structures.
   4 """
   5 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, math, glob
   6 from bup.helpers import *
   7 from bup import _helpers, path
   8
   9 MIDX_VERSION = 2
  10
  11 """Discussion of bloom constants for bup:
  12
  13 There are four basic things to consider when building a bloom filter:
  14 The size, in bits, of the filter
  15 The capacity, in entries, of the filter
  16 The probability of a false positive that is tolerable
  17 The number of bits readily available to use for addresing filter bits
  18
  19 There is one major tunable that is not directly related to the above:
  20 k: the number of bits set in the filter per entry
  21
  22 Here's a wall of numbers showing the relationship between k; the ratio between
  23 the filter size in bits and the entries in the filter; and pfalse_positive:
  24
  25 mn|k=3    |k=4    |k=5    |k=6    |k=7    |k=8    |k=9    |k=10   |k=11
  26  8|3.05794|2.39687|2.16792|2.15771|2.29297|2.54917|2.92244|3.41909|4.05091
  27  9|2.27780|1.65770|1.40703|1.32721|1.34892|1.44631|1.61138|1.84491|2.15259
  28 10|1.74106|1.18133|0.94309|0.84362|0.81937|0.84555|0.91270|1.01859|1.16495
  29 11|1.36005|0.86373|0.65018|0.55222|0.51259|0.50864|0.53098|0.57616|0.64387
  30 12|1.08231|0.64568|0.45945|0.37108|0.32939|0.31424|0.31695|0.33387|0.36380
  31 13|0.87517|0.49210|0.33183|0.25527|0.21689|0.19897|0.19384|0.19804|0.21013
  32 14|0.71759|0.38147|0.24433|0.17934|0.14601|0.12887|0.12127|0.12012|0.12399
  33 15|0.59562|0.30019|0.18303|0.12840|0.10028|0.08523|0.07749|0.07440|0.07468
  34 16|0.49977|0.23941|0.13925|0.09351|0.07015|0.05745|0.05049|0.04700|0.04587
  35 17|0.42340|0.19323|0.10742|0.06916|0.04990|0.03941|0.03350|0.03024|0.02870
  36 18|0.36181|0.15765|0.08392|0.05188|0.03604|0.02748|0.02260|0.01980|0.01827
  37 19|0.31160|0.12989|0.06632|0.03942|0.02640|0.01945|0.01549|0.01317|0.01182
  38 20|0.27026|0.10797|0.05296|0.03031|0.01959|0.01396|0.01077|0.00889|0.00777
  39 21|0.23591|0.09048|0.04269|0.02356|0.01471|0.01014|0.00759|0.00609|0.00518
  40 22|0.20714|0.07639|0.03473|0.01850|0.01117|0.00746|0.00542|0.00423|0.00350
  41 23|0.18287|0.06493|0.02847|0.01466|0.00856|0.00555|0.00392|0.00297|0.00240
  42 24|0.16224|0.05554|0.02352|0.01171|0.00663|0.00417|0.00286|0.00211|0.00166
  43 25|0.14459|0.04779|0.01957|0.00944|0.00518|0.00316|0.00211|0.00152|0.00116
  44 26|0.12942|0.04135|0.01639|0.00766|0.00408|0.00242|0.00157|0.00110|0.00082
  45 27|0.11629|0.03595|0.01381|0.00626|0.00324|0.00187|0.00118|0.00081|0.00059
  46 28|0.10489|0.03141|0.01170|0.00515|0.00259|0.00146|0.00090|0.00060|0.00043
  47 29|0.09492|0.02756|0.00996|0.00426|0.00209|0.00114|0.00069|0.00045|0.00031
  48 30|0.08618|0.02428|0.00853|0.00355|0.00169|0.00090|0.00053|0.00034|0.00023
  49 31|0.07848|0.02147|0.00733|0.00297|0.00138|0.00072|0.00041|0.00025|0.00017
  50 32|0.07167|0.01906|0.00633|0.00250|0.00113|0.00057|0.00032|0.00019|0.00013
  51
  52 Here's a table showing available repository size for a given pfalse_positive
  53 and three values of k (assuming we only use the 160 bit SHA1 for addressing the
  54 filter and 8192bytes per object):
  55
  56 pfalse|obj k=4     |cap k=4    |obj k=5  |cap k=5    |obj k=6 |cap k=6
  57 2.500%|139333497228|1038.11 TiB|558711157|4262.63 GiB|13815755|105.41 GiB
  58 1.000%|104489450934| 778.50 TiB|436090254|3327.10 GiB|11077519| 84.51 GiB
  59 0.125%| 57254889824| 426.58 TiB|261732190|1996.86 GiB| 7063017| 55.89 GiB
  60
  61 This eliminates pretty neatly any k>6 as long as we use the raw SHA for
  62 addressing.
  63
  64 filter size scales linearly with repository size for a given k and pfalse.
  65
  66 Here's a table of filter sizes for a 1 TiB repository:
  67
  68 pfalse| k=3        | k=4        | k=5        | k=6
  69 2.500%| 138.78 MiB | 126.26 MiB | 123.00 MiB | 123.37 MiB
  70 1.000%| 197.83 MiB | 168.36 MiB | 157.58 MiB | 153.87 MiB
  71 0.125%| 421.14 MiB | 307.26 MiB | 262.56 MiB | 241.32 MiB
  72
  73 For bup:
  74 * We want the bloom filter to fit in memory; if it doesn't, the k pagefaults
  75 per lookup will be worse than the two required for midx.
  76 * We want the pfalse_positive to be low enough that the cost of sometimes
  77 faulting on the midx doesn't overcome the benefit of the bloom filter.
  78 * We have readily available 160 bits for addressing the filter.
  79 * We want to be able to have a single bloom address entire repositories of
  80 reasonable size.
  81
  82 Based on these parameters, a combination of k=4 and k=5 provides the behavior
  83 that bup needs.  As such, I've implemented bloom addressing, adding and
  84 checking functions in C for these two values.  Because k=5 requires less space
  85 and gives better overall pfalse_positive perofrmance, it is preferred if a
  86 table with k=5 can represent the repository.
  87
  88 None of this tells us what max_pfalse_positive to choose.
  89
  90 Brandon Low <lostlogic@lostlogicx.com> 04-02-2011
  91 """
  92 BLOOM_VERSION = 2
  93 MAX_BITS_EACH = 32 # Kinda arbitrary, but 4 bytes per entry is pretty big
  94 MAX_BLOOM_BITS = {4: 37, 5: 29} # 160/k-log2(8)
  95 MAX_PFALSE_POSITIVE = 1. # Totally arbitrary, needs benchmarking
  96
  97 verbose = 0
  98 ignore_midx = 0
  99 home_repodir = os.path.expanduser('~/.bup')
 100 repodir = None
 101
 102 _typemap =  { 'blob':3, 'tree':2, 'commit':1, 'tag':4 }
 103 _typermap = { 3:'blob', 2:'tree', 1:'commit', 4:'tag' }
 104
 105 _total_searches = 0
 106 _total_steps = 0
 107
 108
 109 class GitError(Exception):
 110     pass
 111
 112
 113 def repo(sub = ''):
 114     """Get the path to the git repository or one of its subdirectories."""
 115     global repodir
 116     if not repodir:
 117         raise GitError('You should call check_repo_or_die()')
 118
 119     # If there's a .git subdirectory, then the actual repo is in there.
 120     gd = os.path.join(repodir, '.git')
 121     if os.path.exists(gd):
 122         repodir = gd
 123
 124     return os.path.join(repodir, sub)
 125
 126
 127 def auto_midx(objdir):
 128     args = [path.exe(), 'midx', '--auto', '--dir', objdir]
 129     try:
 130         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 131     except OSError, e:
 132         # make sure 'args' gets printed to help with debugging
 133         add_error('%r: exception: %s' % (args, e))
 134         raise
 135     if rv:
 136         add_error('%r: returned %d' % (args, rv))
 137
 138     args = [path.exe(), 'bloom', '--dir', objdir]
 139     try:
 140         rv = subprocess.call(args, stdout=open('/dev/null', 'w'))
 141     except OSError, e:
 142         # make sure 'args' gets printed to help with debugging
 143         add_error('%r: exception: %s' % (args, e))
 144         raise
 145     if rv:
 146         add_error('%r: returned %d' % (args, rv))
 147
 148
 149 def mangle_name(name, mode, gitmode):
 150     """Mangle a file name to present an abstract name for segmented files.
 151     Mangled file names will have the ".bup" extension added to them. If a
 152     file's name already ends with ".bup", a ".bupl" extension is added to
 153     disambiguate normal files from semgmented ones.
 154     """
 155     if stat.S_ISREG(mode) and not stat.S_ISREG(gitmode):
 156         return name + '.bup'
 157     elif name.endswith('.bup') or name[:-1].endswith('.bup'):
 158         return name + '.bupl'
 159     else:
 160         return name
 161
 162
 163 (BUP_NORMAL, BUP_CHUNKED) = (0,1)
 164 def demangle_name(name):
 165     """Remove name mangling from a file name, if necessary.
 166
 167     The return value is a tuple (demangled_filename,mode), where mode is one of
 168     the following:
 169
 170     * BUP_NORMAL  : files that should be read as-is from the repository
 171     * BUP_CHUNKED : files that were chunked and need to be assembled
 172
 173     For more information on the name mangling algorythm, see mangle_name()
 174     """
 175     if name.endswith('.bupl'):
 176         return (name[:-5], BUP_NORMAL)
 177     elif name.endswith('.bup'):
 178         return (name[:-4], BUP_CHUNKED)
 179     else:
 180         return (name, BUP_NORMAL)
 181
 182
 183 def _encode_packobj(type, content):
 184     szout = ''
 185     sz = len(content)
 186     szbits = (sz & 0x0f) | (_typemap[type]<<4)
 187     sz >>= 4
 188     while 1:
 189         if sz: szbits |= 0x80
 190         szout += chr(szbits)
 191         if not sz:
 192             break
 193         szbits = sz & 0x7f
 194         sz >>= 7
 195     z = zlib.compressobj(1)
 196     yield szout
 197     yield z.compress(content)
 198     yield z.flush()
 199
 200
 201 def _encode_looseobj(type, content):
 202     z = zlib.compressobj(1)
 203     yield z.compress('%s %d\0' % (type, len(content)))
 204     yield z.compress(content)
 205     yield z.flush()
 206
 207
 208 def _decode_looseobj(buf):
 209     assert(buf);
 210     s = zlib.decompress(buf)
 211     i = s.find('\0')
 212     assert(i > 0)
 213     l = s[:i].split(' ')
 214     type = l[0]
 215     sz = int(l[1])
 216     content = s[i+1:]
 217     assert(type in _typemap)
 218     assert(sz == len(content))
 219     return (type, content)
 220
 221
 222 def _decode_packobj(buf):
 223     assert(buf)
 224     c = ord(buf[0])
 225     type = _typermap[(c & 0x70) >> 4]
 226     sz = c & 0x0f
 227     shift = 4
 228     i = 0
 229     while c & 0x80:
 230         i += 1
 231         c = ord(buf[i])
 232         sz |= (c & 0x7f) << shift
 233         shift += 7
 234         if not (c & 0x80):
 235             break
 236     return (type, zlib.decompress(buf[i+1:]))
 237
 238
 239 class PackIdx:
 240     def __init__(self):
 241         assert(0)
 242
 243     def find_offset(self, hash):
 244         """Get the offset of an object inside the index file."""
 245         idx = self._idx_from_hash(hash)
 246         if idx != None:
 247             return self._ofs_from_idx(idx)
 248         return None
 249
 250     def exists(self, hash):
 251         """Return nonempty if the object exists in this index."""
 252         return hash and (self._idx_from_hash(hash) != None) and True or None
 253
 254     def __len__(self):
 255         return int(self.fanout[255])
 256
 257     def _idx_from_hash(self, hash):
 258         global _total_searches, _total_steps
 259         _total_searches += 1
 260         assert(len(hash) == 20)
 261         b1 = ord(hash[0])
 262         start = self.fanout[b1-1] # range -1..254
 263         end = self.fanout[b1] # range 0..255
 264         want = str(hash)
 265         _total_steps += 1  # lookup table is a step
 266         while start < end:
 267             _total_steps += 1
 268             mid = start + (end-start)/2
 269             v = self._idx_to_hash(mid)
 270             if v < want:
 271                 start = mid+1
 272             elif v > want:
 273                 end = mid
 274             else: # got it!
 275                 return mid
 276         return None
 277
 278
 279 class PackIdxV1(PackIdx):
 280     """Object representation of a Git pack index (version 1) file."""
 281     def __init__(self, filename, f):
 282         self.name = filename
 283         self.idxnames = [self.name]
 284         self.map = mmap_read(f)
 285         self.fanout = list(struct.unpack('!256I',
 286                                          str(buffer(self.map, 0, 256*4))))
 287         self.fanout.append(0)  # entry "-1"
 288         nsha = self.fanout[255]
 289         self.shatable = buffer(self.map, 256*4, nsha*24)
 290
 291     def _ofs_from_idx(self, idx):
 292         return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
 293
 294     def _idx_to_hash(self, idx):
 295         return str(self.shatable[idx*24+4 : idx*24+24])
 296
 297     def __iter__(self):
 298         for i in xrange(self.fanout[255]):
 299             yield buffer(self.map, 256*4 + 24*i + 4, 20)
 300
 301
 302 class PackIdxV2(PackIdx):
 303     """Object representation of a Git pack index (version 2) file."""
 304     def __init__(self, filename, f):
 305         self.name = filename
 306         self.idxnames = [self.name]
 307         self.map = mmap_read(f)
 308         assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
 309         self.fanout = list(struct.unpack('!256I',
 310                                          str(buffer(self.map, 8, 256*4))))
 311         self.fanout.append(0)  # entry "-1"
 312         nsha = self.fanout[255]
 313         self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
 314         self.ofstable = buffer(self.map,
 315                                8 + 256*4 + nsha*20 + nsha*4,
 316                                nsha*4)
 317         self.ofs64table = buffer(self.map,
 318                                  8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
 319
 320     def _ofs_from_idx(self, idx):
 321         ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
 322         if ofs & 0x80000000:
 323             idx64 = ofs & 0x7fffffff
 324             ofs = struct.unpack('!Q',
 325                                 str(buffer(self.ofs64table, idx64*8, 8)))[0]
 326         return ofs
 327
 328     def _idx_to_hash(self, idx):
 329         return str(self.shatable[idx*20:(idx+1)*20])
 330
 331     def __iter__(self):
 332         for i in xrange(self.fanout[255]):
 333             yield buffer(self.map, 8 + 256*4 + 20*i, 20)
 334
 335
 336 extract_bits = _helpers.extract_bits
 337
 338 bloom_contains = _helpers.bloom_contains
 339 bloom_add = _helpers.bloom_add
 340
 341
 342 class ShaBloom:
 343     """Wrapper which contains data from multiple index files.
 344     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 345     and make it possible for bup to expand Git's indexing capabilities to vast
 346     amounts of files.
 347     """
 348     def __init__(self, filename, f=None, readwrite=False):
 349         self.name = filename
 350         assert(filename.endswith('.bloom'))
 351         if readwrite:
 352             self.rwfile = f or open(filename, 'r+b')
 353             self.map = mmap_readwrite(self.rwfile, close=False)
 354         else:
 355             self.rwfile = None
 356             self.map = mmap_read(f or open(filename, 'rb'))
 357         if str(self.map[0:4]) != 'BLOM':
 358             log('Warning: skipping: invalid BLOM header in %r\n' % filename)
 359             return self._init_failed()
 360         ver = struct.unpack('!I', self.map[4:8])[0]
 361         if ver < BLOOM_VERSION:
 362             log('Warning: ignoring old-style (v%d) bloom %r\n'
 363                 % (ver, filename))
 364             return self._init_failed()
 365         if ver > BLOOM_VERSION:
 366             log('Warning: ignoring too-new (v%d) bloom %r\n'
 367                 % (ver, filename))
 368             return self._init_failed()
 369
 370         self.bits, self.k, self.entries = struct.unpack('!HHI', self.map[8:16])
 371         idxnamestr = str(self.map[16 + 2**self.bits:])
 372         if idxnamestr:
 373             self.idxnames = idxnamestr.split('\0')
 374         else:
 375             self.idxnames = []
 376
 377     def _init_failed(self):
 378         if self.map:
 379             self.map.close()
 380             self.map = None
 381         if self.rwfile:
 382             self.rwfile.close()
 383             self.rwfile = None
 384         self.idxnames = []
 385         self.bits = self.entries = 0
 386
 387     def valid(self):
 388         return self.map and self.bits
 389
 390     def __del__(self):
 391         self.close()
 392
 393     def close(self):
 394         if self.map:
 395             if self.rwfile:
 396                 debug2("bloom: closing with %d entries\n" % self.entries)
 397                 self.map[12:16] = struct.pack('!I', self.entries)
 398                 self.map.flush()
 399         if self.rwfile:
 400             self.rwfile.seek(16 + 2**self.bits)
 401             if self.idxnames:
 402                 self.rwfile.write('\0'.join(self.idxnames))
 403         self._init_failed()
 404
 405     def pfalse_positive(self, additional=0):
 406         n = self.entries + additional
 407         m = 8*2**self.bits
 408         k = self.k
 409         return 100*(1-math.exp(-k*float(n)/m))**k
 410
 411     def add_idx(self, ix):
 412         """Add the object to the filter, return current pfalse_positive."""
 413         if not self.map: raise Exception, "Cannot add to closed bloom"
 414         self.entries += bloom_add(self.map, ix.shatable, self.bits, self.k)
 415         self.idxnames.append(os.path.basename(ix.name))
 416
 417     def exists(self, sha):
 418         """Return nonempty if the object probably exists in the bloom filter."""
 419         global _total_searches, _total_steps
 420         _total_searches += 1
 421         if not self.map: return None
 422         found, steps = bloom_contains(self.map, str(sha), self.bits, self.k)
 423         _total_steps += steps
 424         return found
 425
 426     @classmethod
 427     def create(cls, name, f=None, readwrite=False, expected=100000, k=None):
 428         """Create and return a bloom filter for `expected` entries."""
 429         bits = int(math.floor(math.log(expected*MAX_BITS_EACH/8,2)))
 430         k = k or ((bits <= MAX_BLOOM_BITS[5]) and 5 or 4)
 431         if bits > MAX_BLOOM_BITS[k]:
 432             log('bloom: warning, max bits exceeded, non-optimal\n')
 433             bits = MAX_BLOOM_BITS[k]
 434         debug1('bloom: using 2^%d bytes and %d hash functions\n' % (bits, k))
 435         f = f or open(name, 'w+b')
 436         f.write('BLOM')
 437         f.write(struct.pack('!IHHI', BLOOM_VERSION, bits, k, 0))
 438         assert(f.tell() == 16)
 439         f.write('\0'*2**bits)
 440         f.seek(0)
 441         return cls(name, f=f, readwrite=readwrite)
 442
 443     def __len__(self):
 444         return self.entries
 445
 446
 447 class PackMidx:
 448     """Wrapper which contains data from multiple index files.
 449     Multiple index (.midx) files constitute a wrapper around index (.idx) files
 450     and make it possible for bup to expand Git's indexing capabilities to vast
 451     amounts of files.
 452     """
 453     def __init__(self, filename):
 454         self.name = filename
 455         self.force_keep = False
 456         assert(filename.endswith('.midx'))
 457         self.map = mmap_read(open(filename))
 458         if str(self.map[0:4]) != 'MIDX':
 459             log('Warning: skipping: invalid MIDX header in %r\n' % filename)
 460             self.force_keep = True
 461             return self._init_failed()
 462         ver = struct.unpack('!I', self.map[4:8])[0]
 463         if ver < MIDX_VERSION:
 464             log('Warning: ignoring old-style (v%d) midx %r\n'
 465                 % (ver, filename))
 466             self.force_keep = False  # old stuff is boring
 467             return self._init_failed()
 468         if ver > MIDX_VERSION:
 469             log('Warning: ignoring too-new (v%d) midx %r\n'
 470                 % (ver, filename))
 471             self.force_keep = True  # new stuff is exciting
 472             return self._init_failed()
 473
 474         self.bits = _helpers.firstword(self.map[8:12])
 475         self.entries = 2**self.bits
 476         self.fanout = buffer(self.map, 12, self.entries*4)
 477         shaofs = 12 + self.entries*4
 478         nsha = self._fanget(self.entries-1)
 479         self.shatable = buffer(self.map, shaofs, nsha*20)
 480         self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
 481
 482     def _init_failed(self):
 483         self.bits = 0
 484         self.entries = 1
 485         self.fanout = buffer('\0\0\0\0')
 486         self.shatable = buffer('\0'*20)
 487         self.idxnames = []
 488
 489     def _fanget(self, i):
 490         start = i*4
 491         s = self.fanout[start:start+4]
 492         return _helpers.firstword(s)
 493
 494     def _get(self, i):
 495         return str(self.shatable[i*20:(i+1)*20])
 496
 497     def exists(self, hash):
 498         """Return nonempty if the object exists in the index files."""
 499         global _total_searches, _total_steps
 500         _total_searches += 1
 501         want = str(hash)
 502         el = extract_bits(want, self.bits)
 503         if el:
 504             start = self._fanget(el-1)
 505             startv = el << (32-self.bits)
 506         else:
 507             start = 0
 508             startv = 0
 509         end = self._fanget(el)
 510         endv = (el+1) << (32-self.bits)
 511         _total_steps += 1   # lookup table is a step
 512         hashv = _helpers.firstword(hash)
 513         #print '(%08x) %08x %08x %08x' % (extract_bits(want, 32), startv, hashv, endv)
 514         while start < end:
 515             _total_steps += 1
 516             #print '! %08x %08x %08x   %d - %d' % (startv, hashv, endv, start, end)
 517             mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
 518             #print '  %08x %08x %08x   %d %d %d' % (startv, hashv, endv, start, mid, end)
 519             v = self._get(mid)
 520             #print '    %08x' % self._num(v)
 521             if v < want:
 522                 start = mid+1
 523                 startv = _helpers.firstword(v)
 524             elif v > want:
 525                 end = mid
 526                 endv = _helpers.firstword(v)
 527             else: # got it!
 528                 return True
 529         return None
 530
 531     def __iter__(self):
 532         for i in xrange(self._fanget(self.entries-1)):
 533             yield buffer(self.shatable, i*20, 20)
 534
 535     def __len__(self):
 536         return int(self._fanget(self.entries-1))
 537
 538
 539 _mpi_count = 0
 540 class PackIdxList:
 541     def __init__(self, dir):
 542         global _mpi_count
 543         assert(_mpi_count == 0) # these things suck tons of VM; don't waste it
 544         _mpi_count += 1
 545         self.dir = dir
 546         self.also = set()
 547         self.packs = []
 548         self.do_bloom = False
 549         self.bloom = None
 550         self.refresh()
 551
 552     def __del__(self):
 553         global _mpi_count
 554         _mpi_count -= 1
 555         assert(_mpi_count == 0)
 556
 557     def __iter__(self):
 558         return iter(idxmerge(self.packs))
 559
 560     def __len__(self):
 561         return sum(len(pack) for pack in self.packs)
 562
 563     def exists(self, hash):
 564         """Return nonempty if the object exists in the index files."""
 565         global _total_searches
 566         _total_searches += 1
 567         if hash in self.also:
 568             return True
 569         if self.do_bloom and self.bloom is not None:
 570             _total_searches -= 1  # will be incremented by bloom
 571             if self.bloom.exists(hash):
 572                 self.do_bloom = False
 573             else:
 574                 return None
 575         for i in xrange(len(self.packs)):
 576             p = self.packs[i]
 577             _total_searches -= 1  # will be incremented by sub-pack
 578             if p.exists(hash):
 579                 # reorder so most recently used packs are searched first
 580                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
 581                 return p.name
 582         self.do_bloom = True
 583         return None
 584
 585     def refresh(self, skip_midx = False):
 586         """Refresh the index list.
 587         This method verifies if .midx files were superseded (e.g. all of its
 588         contents are in another, bigger .midx file) and removes the superseded
 589         files.
 590
 591         If skip_midx is True, all work on .midx files will be skipped and .midx
 592         files will be removed from the list.
 593
 594         The module-global variable 'ignore_midx' can force this function to
 595         always act as if skip_midx was True.
 596         """
 597         self.bloom = None # Always reopen the bloom as it may have been relaced
 598         self.do_bloom = False
 599         skip_midx = skip_midx or ignore_midx
 600         d = dict((p.name, p) for p in self.packs
 601                  if not skip_midx or not isinstance(p, PackMidx))
 602         if os.path.exists(self.dir):
 603             if not skip_midx:
 604                 midxl = []
 605                 for ix in self.packs:
 606                     if isinstance(ix, PackMidx):
 607                         for name in ix.idxnames:
 608                             d[os.path.join(self.dir, name)] = ix
 609                 for full in glob.glob(os.path.join(self.dir,'*.midx')):
 610                     if not d.get(full):
 611                         mx = PackMidx(full)
 612                         (mxd, mxf) = os.path.split(mx.name)
 613                         broken = False
 614                         for n in mx.idxnames:
 615                             if not os.path.exists(os.path.join(mxd, n)):
 616                                 log(('warning: index %s missing\n' +
 617                                     '  used by %s\n') % (n, mxf))
 618                                 broken = True
 619                         if broken:
 620                             del mx
 621                             unlink(full)
 622                         else:
 623                             midxl.append(mx)
 624                 midxl.sort(lambda x,y: -cmp(len(x),len(y)))
 625                 for ix in midxl:
 626                     any_needed = False
 627                     for sub in ix.idxnames:
 628                         found = d.get(os.path.join(self.dir, sub))
 629                         if not found or isinstance(found, PackIdx):
 630                             # doesn't exist, or exists but not in a midx
 631                             any_needed = True
 632                             break
 633                     if any_needed:
 634                         d[ix.name] = ix
 635                         for name in ix.idxnames:
 636                             d[os.path.join(self.dir, name)] = ix
 637                     elif not ix.force_keep:
 638                         debug1('midx: removing redundant: %s\n'
 639                                % os.path.basename(ix.name))
 640                         unlink(ix.name)
 641             for full in glob.glob(os.path.join(self.dir,'*.idx')):
 642                 if not d.get(full):
 643                     try:
 644                         ix = open_idx(full)
 645                     except GitError, e:
 646                         add_error(e)
 647                         continue
 648                     d[full] = ix
 649             bfull = os.path.join(self.dir, 'bup.bloom')
 650             if self.bloom is None and os.path.exists(bfull):
 651                 self.bloom = ShaBloom(bfull)
 652             self.packs = list(set(d.values()))
 653             self.packs.sort(lambda x,y: -cmp(len(x),len(y)))
 654             if self.bloom and self.bloom.valid() and len(self.bloom) >= len(self):
 655                 self.do_bloom = True
 656             else:
 657                 self.bloom = None
 658         debug1('PackIdxList: using %d index%s.\n'
 659             % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 660
 661     def packname_containing(self, hash):
 662         # figure out which pack contains a given hash.
 663         # FIXME: if the midx file format would just *store* this information,
 664         # we could calculate it a lot more efficiently.  But it's not needed
 665         # often, so let's do it like this.
 666         for f in glob.glob(os.path.join(self.dir,'*.idx')):
 667             full = os.path.join(self.dir, f)
 668             try:
 669                 ix = open_idx(full)
 670             except GitError, e:
 671                 add_error(e)
 672                 continue
 673             if ix.exists(hash):
 674                 return full
 675
 676     def add(self, hash):
 677         """Insert an additional object in the list."""
 678         self.also.add(hash)
 679
 680
 681 def calc_hash(type, content):
 682     """Calculate some content's hash in the Git fashion."""
 683     header = '%s %d\0' % (type, len(content))
 684     sum = Sha1(header)
 685     sum.update(content)
 686     return sum.digest()
 687
 688
 689 def _shalist_sort_key(ent):
 690     (mode, name, id) = ent
 691     if stat.S_ISDIR(int(mode, 8)):
 692         return name + '/'
 693     else:
 694         return name
 695
 696
 697 def open_idx(filename):
 698     if filename.endswith('.idx'):
 699         f = open(filename, 'rb')
 700         header = f.read(8)
 701         if header[0:4] == '\377tOc':
 702             version = struct.unpack('!I', header[4:8])[0]
 703             if version == 2:
 704                 return PackIdxV2(filename, f)
 705             else:
 706                 raise GitError('%s: expected idx file version 2, got %d'
 707                                % (filename, version))
 708         elif len(header) == 8 and header[0:4] < '\377tOc':
 709             return PackIdxV1(filename, f)
 710         else:
 711             raise GitError('%s: unrecognized idx file header' % filename)
 712     elif filename.endswith('.midx'):
 713         return PackMidx(filename)
 714     else:
 715         raise GitError('idx filenames must end with .idx or .midx')
 716
 717
 718 def idxmerge(idxlist, final_progress=True):
 719     """Generate a list of all the objects reachable in a PackIdxList."""
 720     def pfunc(count, total):
 721         progress('Reading indexes: %.2f%% (%d/%d)\r'
 722                  % (count*100.0/total, count, total))
 723     def pfinal(count, total):
 724         if final_progress:
 725             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
 726     return merge_iter(idxlist, 10024, pfunc, pfinal)
 727
 728
 729 def _make_objcache():
 730     return PackIdxList(repo('objects/pack'))
 731
 732 class PackWriter:
 733     """Writes Git objects insid a pack file."""
 734     def __init__(self, objcache_maker=_make_objcache):
 735         self.count = 0
 736         self.outbytes = 0
 737         self.filename = None
 738         self.file = None
 739         self.idx = None
 740         self.objcache_maker = objcache_maker
 741         self.objcache = None
 742
 743     def __del__(self):
 744         self.close()
 745
 746     def _open(self):
 747         if not self.file:
 748             (fd,name) = tempfile.mkstemp(suffix='.pack', dir=repo('objects'))
 749             self.file = os.fdopen(fd, 'w+b')
 750             assert(name.endswith('.pack'))
 751             self.filename = name[:-5]
 752             self.file.write('PACK\0\0\0\2\0\0\0\0')
 753             self.idx = list(list() for i in xrange(256))
 754
 755     # the 'sha' parameter is used in client.py's _raw_write(), but not needed
 756     # in this basic version.
 757     def _raw_write(self, datalist, sha):
 758         self._open()
 759         f = self.file
 760         # in case we get interrupted (eg. KeyboardInterrupt), it's best if
 761         # the file never has a *partial* blob.  So let's make sure it's
 762         # all-or-nothing.  (The blob shouldn't be very big anyway, thanks
 763         # to our hashsplit algorithm.)  f.write() does its own buffering,
 764         # but that's okay because we'll flush it in _end().
 765         oneblob = ''.join(datalist)
 766         try:
 767             f.write(oneblob)
 768         except IOError, e:
 769             raise GitError, e, sys.exc_info()[2]
 770         nw = len(oneblob)
 771         crc = zlib.crc32(oneblob) & 0xffffffff
 772         self._update_idx(sha, crc, nw)
 773         self.outbytes += nw
 774         self.count += 1
 775         return nw, crc
 776
 777     def _update_idx(self, sha, crc, size):
 778         assert(sha)
 779         if self.idx:
 780             self.idx[ord(sha[0])].append((sha, crc, self.file.tell() - size))
 781
 782     def _write(self, sha, type, content):
 783         if verbose:
 784             log('>')
 785         if not sha:
 786             sha = calc_hash(type, content)
 787         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
 788         return sha
 789
 790     def breakpoint(self):
 791         """Clear byte and object counts and return the last processed id."""
 792         id = self._end()
 793         self.outbytes = self.count = 0
 794         return id
 795
 796     def _require_objcache(self):
 797         if self.objcache is None and self.objcache_maker:
 798             self.objcache = self.objcache_maker()
 799         if self.objcache is None:
 800             raise GitError(
 801                     "PackWriter not opened or can't check exists w/o objcache")
 802
 803     def exists(self, id):
 804         """Return non-empty if an object is found in the object cache."""
 805         self._require_objcache()
 806         return self.objcache.exists(id)
 807
 808     def maybe_write(self, type, content):
 809         """Write an object to the pack file if not present and return its id."""
 810         self._require_objcache()
 811         sha = calc_hash(type, content)
 812         if not self.exists(sha):
 813             self._write(sha, type, content)
 814             self.objcache.add(sha)
 815         return sha
 816
 817     def new_blob(self, blob):
 818         """Create a blob object in the pack with the supplied content."""
 819         return self.maybe_write('blob', blob)
 820
 821     def new_tree(self, shalist):
 822         """Create a tree object in the pack."""
 823         shalist = sorted(shalist, key = _shalist_sort_key)
 824         l = []
 825         for (mode,name,bin) in shalist:
 826             assert(mode)
 827             assert(mode != '0')
 828             assert(mode[0] != '0')
 829             assert(name)
 830             assert(len(bin) == 20)
 831             l.append('%s %s\0%s' % (mode,name,bin))
 832         return self.maybe_write('tree', ''.join(l))
 833
 834     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
 835         l = []
 836         if tree: l.append('tree %s' % tree.encode('hex'))
 837         if parent: l.append('parent %s' % parent.encode('hex'))
 838         if author: l.append('author %s %s' % (author, _git_date(adate)))
 839         if committer: l.append('committer %s %s' % (committer, _git_date(cdate)))
 840         l.append('')
 841         l.append(msg)
 842         return self.maybe_write('commit', '\n'.join(l))
 843
 844     def new_commit(self, parent, tree, date, msg):
 845         """Create a commit object in the pack."""
 846         userline = '%s <%s@%s>' % (userfullname(), username(), hostname())
 847         commit = self._new_commit(tree, parent,
 848                                   userline, date, userline, date,
 849                                   msg)
 850         return commit
 851
 852     def abort(self):
 853         """Remove the pack file from disk."""
 854         f = self.file
 855         if f:
 856             self.idx = None
 857             self.file = None
 858             f.close()
 859             os.unlink(self.filename + '.pack')
 860
 861     def _end(self, run_midx=True):
 862         f = self.file
 863         if not f: return None
 864         self.file = None
 865         self.objcache = None
 866         idx = self.idx
 867         self.idx = None
 868
 869         # update object count
 870         f.seek(8)
 871         cp = struct.pack('!i', self.count)
 872         assert(len(cp) == 4)
 873         f.write(cp)
 874
 875         # calculate the pack sha1sum
 876         f.seek(0)
 877         sum = Sha1()
 878         for b in chunkyreader(f):
 879             sum.update(b)
 880         packbin = sum.digest()
 881         f.write(packbin)
 882         f.close()
 883
 884         idx_f = open(self.filename + '.idx', 'wb')
 885         obj_list_sha = self._write_pack_idx_v2(idx_f, idx, packbin)
 886         idx_f.close()
 887
 888         nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
 889         if os.path.exists(self.filename + '.map'):
 890             os.unlink(self.filename + '.map')
 891         os.rename(self.filename + '.pack', nameprefix + '.pack')
 892         os.rename(self.filename + '.idx', nameprefix + '.idx')
 893
 894         if run_midx:
 895             auto_midx(repo('objects/pack'))
 896         return nameprefix
 897
 898     def close(self, run_midx=True):
 899         """Close the pack file and move it to its definitive path."""
 900         return self._end(run_midx=run_midx)
 901
 902     def _write_pack_idx_v2(self, file, idx, packbin):
 903         sum = Sha1()
 904
 905         def write(data):
 906             file.write(data)
 907             sum.update(data)
 908
 909         write('\377tOc\0\0\0\2')
 910
 911         n = 0
 912         for part in idx:
 913             n += len(part)
 914             write(struct.pack('!i', n))
 915             part.sort(key=lambda x: x[0])
 916
 917         obj_list_sum = Sha1()
 918         for part in idx:
 919             for entry in part:
 920                 write(entry[0])
 921                 obj_list_sum.update(entry[0])
 922         for part in idx:
 923             for entry in part:
 924                 write(struct.pack('!I', entry[1]))
 925         ofs64_list = []
 926         for part in idx:
 927             for entry in part:
 928                 if entry[2] & 0x80000000:
 929                     write(struct.pack('!I', 0x80000000 | len(ofs64_list)))
 930                     ofs64_list.append(struct.pack('!Q', entry[2]))
 931                 else:
 932                     write(struct.pack('!i', entry[2]))
 933         for ofs64 in ofs64_list:
 934             write(ofs64)
 935
 936         write(packbin)
 937         file.write(sum.digest())
 938         return obj_list_sum.hexdigest()
 939
 940
 941 def _git_date(date):
 942     return '%d %s' % (date, time.strftime('%z', time.localtime(date)))
 943
 944
 945 def _gitenv():
 946     os.environ['GIT_DIR'] = os.path.abspath(repo())
 947
 948
 949 def list_refs(refname = None):
 950     """Generate a list of tuples in the form (refname,hash).
 951     If a ref name is specified, list only this particular ref.
 952     """
 953     argv = ['git', 'show-ref', '--']
 954     if refname:
 955         argv += [refname]
 956     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 957     out = p.stdout.read().strip()
 958     rv = p.wait()  # not fatal
 959     if rv:
 960         assert(not out)
 961     if out:
 962         for d in out.split('\n'):
 963             (sha, name) = d.split(' ', 1)
 964             yield (name, sha.decode('hex'))
 965
 966
 967 def read_ref(refname):
 968     """Get the commit id of the most recent commit made on a given ref."""
 969     l = list(list_refs(refname))
 970     if l:
 971         assert(len(l) == 1)
 972         return l[0][1]
 973     else:
 974         return None
 975
 976
 977 def rev_list(ref, count=None):
 978     """Generate a list of reachable commits in reverse chronological order.
 979
 980     This generator walks through commits, from child to parent, that are
 981     reachable via the specified ref and yields a series of tuples of the form
 982     (date,hash).
 983
 984     If count is a non-zero integer, limit the number of commits to "count"
 985     objects.
 986     """
 987     assert(not ref.startswith('-'))
 988     opts = []
 989     if count:
 990         opts += ['-n', str(atoi(count))]
 991     argv = ['git', 'rev-list', '--pretty=format:%ct'] + opts + [ref, '--']
 992     p = subprocess.Popen(argv, preexec_fn = _gitenv, stdout = subprocess.PIPE)
 993     commit = None
 994     for row in p.stdout:
 995         s = row.strip()
 996         if s.startswith('commit '):
 997             commit = s[7:].decode('hex')
 998         else:
 999             date = int(s)
1000             yield (date, commit)
1001     rv = p.wait()  # not fatal
1002     if rv:
1003         raise GitError, 'git rev-list returned error %d' % rv
1004
1005
1006 def rev_get_date(ref):
1007     """Get the date of the latest commit on the specified ref."""
1008     for (date, commit) in rev_list(ref, count=1):
1009         return date
1010     raise GitError, 'no such commit %r' % ref
1011
1012
1013 def rev_parse(committish):
1014     """Resolve the full hash for 'committish', if it exists.
1015
1016     Should be roughly equivalent to 'git rev-parse'.
1017
1018     Returns the hex value of the hash if it is found, None if 'committish' does
1019     not correspond to anything.
1020     """
1021     head = read_ref(committish)
1022     if head:
1023         debug2("resolved from ref: commit = %s\n" % head.encode('hex'))
1024         return head
1025
1026     pL = PackIdxList(repo('objects/pack'))
1027
1028     if len(committish) == 40:
1029         try:
1030             hash = committish.decode('hex')
1031         except TypeError:
1032             return None
1033
1034         if pL.exists(hash):
1035             return hash
1036
1037     return None
1038
1039
1040 def update_ref(refname, newval, oldval):
1041     """Change the commit pointed to by a branch."""
1042     if not oldval:
1043         oldval = ''
1044     assert(refname.startswith('refs/heads/'))
1045     p = subprocess.Popen(['git', 'update-ref', refname,
1046                           newval.encode('hex'), oldval.encode('hex')],
1047                          preexec_fn = _gitenv)
1048     _git_wait('git update-ref', p)
1049
1050
1051 def guess_repo(path=None):
1052     """Set the path value in the global variable "repodir".
1053     This makes bup look for an existing bup repository, but not fail if a
1054     repository doesn't exist. Usually, if you are interacting with a bup
1055     repository, you would not be calling this function but using
1056     check_repo_or_die().
1057     """
1058     global repodir
1059     if path:
1060         repodir = path
1061     if not repodir:
1062         repodir = os.environ.get('BUP_DIR')
1063         if not repodir:
1064             repodir = os.path.expanduser('~/.bup')
1065
1066
1067 def init_repo(path=None):
1068     """Create the Git bare repository for bup in a given path."""
1069     guess_repo(path)
1070     d = repo()  # appends a / to the path
1071     parent = os.path.dirname(os.path.dirname(d))
1072     if parent and not os.path.exists(parent):
1073         raise GitError('parent directory "%s" does not exist\n' % parent)
1074     if os.path.exists(d) and not os.path.isdir(os.path.join(d, '.')):
1075         raise GitError('"%d" exists but is not a directory\n' % d)
1076     p = subprocess.Popen(['git', '--bare', 'init'], stdout=sys.stderr,
1077                          preexec_fn = _gitenv)
1078     _git_wait('git init', p)
1079     # Force the index version configuration in order to ensure bup works
1080     # regardless of the version of the installed Git binary.
1081     p = subprocess.Popen(['git', 'config', 'pack.indexVersion', '2'],
1082                          stdout=sys.stderr, preexec_fn = _gitenv)
1083     _git_wait('git config', p)
1084
1085
1086 def check_repo_or_die(path=None):
1087     """Make sure a bup repository exists, and abort if not.
1088     If the path to a particular repository was not specified, this function
1089     initializes the default repository automatically.
1090     """
1091     guess_repo(path)
1092     if not os.path.isdir(repo('objects/pack/.')):
1093         if repodir == home_repodir:
1094             init_repo()
1095         else:
1096             log('error: %r is not a bup/git repository\n' % repo())
1097             sys.exit(15)
1098
1099
1100 def treeparse(buf):
1101     """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
1102     ofs = 0
1103     while ofs < len(buf):
1104         z = buf[ofs:].find('\0')
1105         assert(z > 0)
1106         spl = buf[ofs:ofs+z].split(' ', 1)
1107         assert(len(spl) == 2)
1108         sha = buf[ofs+z+1:ofs+z+1+20]
1109         ofs += z+1+20
1110         yield (spl[0], spl[1], sha)
1111
1112
1113 _ver = None
1114 def ver():
1115     """Get Git's version and ensure a usable version is installed.
1116
1117     The returned version is formatted as an ordered tuple with each position
1118     representing a digit in the version tag. For example, the following tuple
1119     would represent version 1.6.6.9:
1120
1121         ('1', '6', '6', '9')
1122     """
1123     global _ver
1124     if not _ver:
1125         p = subprocess.Popen(['git', '--version'],
1126                              stdout=subprocess.PIPE)
1127         gvs = p.stdout.read()
1128         _git_wait('git --version', p)
1129         m = re.match(r'git version (\S+.\S+)', gvs)
1130         if not m:
1131             raise GitError('git --version weird output: %r' % gvs)
1132         _ver = tuple(m.group(1).split('.'))
1133     needed = ('1','5', '3', '1')
1134     if _ver < needed:
1135         raise GitError('git version %s or higher is required; you have %s'
1136                        % ('.'.join(needed), '.'.join(_ver)))
1137     return _ver
1138
1139
1140 def _git_wait(cmd, p):
1141     rv = p.wait()
1142     if rv != 0:
1143         raise GitError('%s returned %d' % (cmd, rv))
1144
1145
1146 def _git_capture(argv):
1147     p = subprocess.Popen(argv, stdout=subprocess.PIPE, preexec_fn = _gitenv)
1148     r = p.stdout.read()
1149     _git_wait(repr(argv), p)
1150     return r
1151
1152
1153 class _AbortableIter:
1154     def __init__(self, it, onabort = None):
1155         self.it = it
1156         self.onabort = onabort
1157         self.done = None
1158
1159     def __iter__(self):
1160         return self
1161
1162     def next(self):
1163         try:
1164             return self.it.next()
1165         except StopIteration, e:
1166             self.done = True
1167             raise
1168         except:
1169             self.abort()
1170             raise
1171
1172     def abort(self):
1173         """Abort iteration and call the abortion callback, if needed."""
1174         if not self.done:
1175             self.done = True
1176             if self.onabort:
1177                 self.onabort()
1178
1179     def __del__(self):
1180         self.abort()
1181
1182
1183 _ver_warned = 0
1184 class CatPipe:
1185     """Link to 'git cat-file' that is used to retrieve blob data."""
1186     def __init__(self):
1187         global _ver_warned
1188         wanted = ('1','5','6')
1189         if ver() < wanted:
1190             if not _ver_warned:
1191                 log('warning: git version < %s; bup will be slow.\n'
1192                     % '.'.join(wanted))
1193                 _ver_warned = 1
1194             self.get = self._slow_get
1195         else:
1196             self.p = self.inprogress = None
1197             self.get = self._fast_get
1198
1199     def _abort(self):
1200         if self.p:
1201             self.p.stdout.close()
1202             self.p.stdin.close()
1203         self.p = None
1204         self.inprogress = None
1205
1206     def _restart(self):
1207         self._abort()
1208         self.p = subprocess.Popen(['git', 'cat-file', '--batch'],
1209                                   stdin=subprocess.PIPE,
1210                                   stdout=subprocess.PIPE,
1211                                   close_fds = True,
1212                                   bufsize = 4096,
1213                                   preexec_fn = _gitenv)
1214
1215     def _fast_get(self, id):
1216         if not self.p or self.p.poll() != None:
1217             self._restart()
1218         assert(self.p)
1219         assert(self.p.poll() == None)
1220         if self.inprogress:
1221             log('_fast_get: opening %r while %r is open'
1222                 % (id, self.inprogress))
1223         assert(not self.inprogress)
1224         assert(id.find('\n') < 0)
1225         assert(id.find('\r') < 0)
1226         assert(not id.startswith('-'))
1227         self.inprogress = id
1228         self.p.stdin.write('%s\n' % id)
1229         self.p.stdin.flush()
1230         hdr = self.p.stdout.readline()
1231         if hdr.endswith(' missing\n'):
1232             self.inprogress = None
1233             raise KeyError('blob %r is missing' % id)
1234         spl = hdr.split(' ')
1235         if len(spl) != 3 or len(spl[0]) != 40:
1236             raise GitError('expected blob, got %r' % spl)
1237         (hex, type, size) = spl
1238
1239         it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
1240                            onabort = self._abort)
1241         try:
1242             yield type
1243             for blob in it:
1244                 yield blob
1245             assert(self.p.stdout.readline() == '\n')
1246             self.inprogress = None
1247         except Exception, e:
1248             it.abort()
1249             raise
1250
1251     def _slow_get(self, id):
1252         assert(id.find('\n') < 0)
1253         assert(id.find('\r') < 0)
1254         assert(id[0] != '-')
1255         type = _git_capture(['git', 'cat-file', '-t', id]).strip()
1256         yield type
1257
1258         p = subprocess.Popen(['git', 'cat-file', type, id],
1259                              stdout=subprocess.PIPE,
1260                              preexec_fn = _gitenv)
1261         for blob in chunkyreader(p.stdout):
1262             yield blob
1263         _git_wait('git cat-file', p)
1264
1265     def _join(self, it):
1266         type = it.next()
1267         if type == 'blob':
1268             for blob in it:
1269                 yield blob
1270         elif type == 'tree':
1271             treefile = ''.join(it)
1272             for (mode, name, sha) in treeparse(treefile):
1273                 for blob in self.join(sha.encode('hex')):
1274                     yield blob
1275         elif type == 'commit':
1276             treeline = ''.join(it).split('\n')[0]
1277             assert(treeline.startswith('tree '))
1278             for blob in self.join(treeline[5:]):
1279                 yield blob
1280         else:
1281             raise GitError('invalid object type %r: expected blob/tree/commit'
1282                            % type)
1283
1284     def join(self, id):
1285         """Generate a list of the content of all blobs that can be reached
1286         from an object.  The hash given in 'id' must point to a blob, a tree
1287         or a commit. The content of all blobs that can be seen from trees or
1288         commits will be added to the list.
1289         """
1290         try:
1291             for d in self._join(self.get(id)):
1292                 yield d
1293         except StopIteration:
1294             log('booger!\n')
1295
1296 def tags():
1297     """Return a dictionary of all tags in the form {hash: [tag_names, ...]}."""
1298     tags = {}
1299     for (n,c) in list_refs():
1300         if n.startswith('refs/tags/'):
1301             name = n[10:]
1302             if not c in tags:
1303                 tags[c] = []
1304
1305             tags[c].append(name)  # more than one tag can point at 'c'
1306
1307     return tags