# hashsplit.py -- from the bup project (bup.git), blob snapshot as of the
# commit "cmd-save: fix a potential divide by zero error."
1 import sys
2 import git, _hashsplit
3 from helpers import *
4
BLOB_LWM = 8192*2        # low-water mark: refill the buffer when below this
BLOB_MAX = BLOB_LWM*2    # force a chunk out once this much is buffered unsplit
BLOB_HWM = 1024*1024     # bytes requested per f.read() in blobiter()
split_verbosely = 0      # 0=quiet, 1=log progress per MB, 2=also '.' per split
progress_callback = None # optional callable(size), invoked once per chunk
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
fanout = 4096            # chunks per subtree in split_to_shalist (falsy=flat list)
13
class Buf:
    """A simple byte FIFO: append at the back with put(), consume from
    the front with eat()/get()/peek().

    Consumed bytes are tracked with a start offset, so the backing
    string is only rebuilt (compacted) when new data arrives.  Uses the
    Python 2 buffer() type for zero-copy slices of the backing string.
    """
    def __init__(self):
        self.data = ''   # backing string; bytes before self.start are spent
        self.start = 0   # index of the first unconsumed byte

    def put(self, s):
        # Compact on append: drop the consumed prefix, then concatenate.
        # (buffer + str yields a plain str under Python 2.)
        if s:
            self.data = buffer(self.data, self.start) + s
            self.start = 0

    def peek(self, count):
        # Zero-copy view of up to `count` unconsumed bytes.
        return buffer(self.data, self.start, count)

    def eat(self, count):
        # Discard `count` bytes from the front.
        self.start += count

    def get(self, count):
        # peek() and eat() in one step.
        result = buffer(self.data, self.start, count)
        self.start += count
        return result

    def used(self):
        # Number of unconsumed bytes currently held.
        return len(self.data) - self.start
38
39
def splitbuf(buf):
    """Try to take one hashsplit chunk off the front of `buf` (a Buf).

    Returns a zero-copy buffer holding the chunk, or None if the C
    splitter found no split point in the currently buffered data.
    """
    b = buf.peek(buf.used())
    ofs = _hashsplit.splitbuf(b)
    if not ofs:
        return None
    if split_verbosely >= 2:
        log('.')
    buf.eat(ofs)
    return buffer(b, 0, ofs)
49
50
def blobiter(files):
    """Yield the contents of each file in chunks of up to BLOB_HWM bytes,
    then yield a single '' so consumers can detect end-of-input."""
    for f in files:
        while 1:
            chunk = f.read(BLOB_HWM)
            if not chunk:
                break
            yield chunk
    yield '' # EOF indicator
59
60
def autofiles(filenames):
    """Yield an open file object per name in filenames, or sys.stdin if
    filenames is empty/None.  Callers own (and must close) the files."""
    if filenames:
        for name in filenames:
            yield open(name)
    else:
        yield sys.stdin
67             
68     
def hashsplit_iter(w, files):
    """Split the concatenated contents of `files` into content-defined blobs.

    `w` is a pack writer providing new_blob()/breakpoint() and the
    outbytes/count counters (presumably git.PackWriter -- confirm against
    callers).  Yields one (ofs, size, sha) tuple per blob written, where
    ofs is the blob's offset in the overall input stream.
    """
    ofs = 0            # bytes of input already emitted as blobs
    buf = Buf()        # data read from input but not yet split off
    fi = blobiter(files)
    blob = 1           # nonzero so the loop body runs at least once

    eof = 0
    lv = 0             # last whole-MB progress value logged
    while blob or not eof:
        # Refill when the buffer runs low, or when the last pass found
        # no split point (more data may reveal one).
        if not eof and (buf.used() < BLOB_LWM or not blob):
            bnew = fi.next()
            if not bnew: eof = 1   # blobiter yields '' exactly once, at EOF
            #log('got %d, total %d\n' % (len(bnew), buf.used()))
            buf.put(bnew)

        blob = splitbuf(buf)
        if eof and not blob:
            # No split point and no more input: flush whatever remains.
            blob = buf.get(buf.used())
        if not blob and buf.used() >= BLOB_MAX:
            blob = buf.get(buf.used())  # limit max blob size
        if not blob and not eof:
            continue

        if blob:
            # Start a fresh pack before the current one gets unwieldy.
            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
                w.breakpoint()
            yield (ofs, len(blob), w.new_blob(blob))
            ofs += len(blob)

        # Progress: log once per megabyte of input consumed (Python 2
        # integer division).
        nv = (ofs + buf.used())/1000000
        if nv != lv:
            if split_verbosely >= 1:
                log('%d\t' % nv)
            lv = nv
103
104
total_split = 0  # running total of bytes split, accumulated across calls
def _split_to_shalist(w, files):
    """Yield a git tree entry ('100644', 'bup.chunk.%016x' % cn, sha) for
    each blob produced by hashsplit_iter(w, files).

    The chunk name is the offset rounded down to a granularity `bm` that
    is halved until the name advances past the previous chunk's name, so
    small edits to the input leave most chunk names unchanged.
    """
    global total_split
    ofs = 0
    last_ofs = 0
    for (ofs, size, sha) in hashsplit_iter(w, files):
        #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
        # this silliness keeps chunk filenames "similar" when a file changes
        # slightly.
        bm = BLOB_MAX
        while 1:
            # Round ofs down to a multiple of bm (Python 2 integer '/').
            cn = ofs / bm * bm
            #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
            if cn > last_ofs or ofs == last_ofs: break
            # Name didn't advance: halve the granularity and retry.
            bm /= 2
        last_ofs = cn
        total_split += size
        if progress_callback:
            progress_callback(size)
        yield ('100644', 'bup.chunk.%016x' % cn, sha)
125
126
127 def _next(i):
128     try:
129         return i.next()
130     except StopIteration:
131         return None
132
133
def split_to_shalist(w, files):
    """Split `files` into chunks and return them as a git shalist.

    With fanout disabled (falsy), the chunks are returned as one flat
    list.  Otherwise, consecutive chunks are grouped into subtree
    entries ('40000', first-chunk-name, tree-sha) once a group reaches
    max(fanout, 3) entries; any trailing partial group stays ungrouped.
    """
    chunks = iter(_split_to_shalist(w, files))
    if not fanout:
        return list(chunks)
    shalist = []
    pending = []
    for entry in chunks:
        pending.append(entry)
        if len(pending) >= fanout and len(pending) >= 3:
            shalist.append(('40000', pending[0][1], w.new_tree(pending)))
            pending = []
    shalist += pending
    return shalist
148
149
def split_to_blob_or_tree(w, files):
    """Split `files` and return a single (mode, sha) pair: the lone
    chunk itself if there is exactly one, an empty blob if there are
    none, or a tree containing all the chunks otherwise."""
    shalist = list(split_to_shalist(w, files))
    if not shalist:
        return ('100644', w.new_blob(''))
    if len(shalist) == 1:
        (mode, name, sha) = shalist[0]
        return (mode, sha)
    return ('40000', w.new_tree(shalist))