import sys, math
import git, _hashsplit
from helpers import *   # among others, supplies a next() that returns None at end of iteration

BLOB_LWM = 8192*2       # low-water mark; used here only to derive BLOB_MAX
BLOB_MAX = BLOB_LWM*2   # no single blob should be larger than this
BLOB_HWM = 1024*1024    # high-water mark: refill the read buffer to this size
MAX_PER_TREE = 256      # maximum entries per tree before squishing a level
progress_callback = None
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
fanout = 16

class Buf:
    # A byte FIFO built on Python 2 buffer() objects to avoid copying:
    # put() appends bytes, peek()/eat()/get() consume from the front, and
    # 'start' tracks how much of 'data' has already been eaten.
    def __init__(self):
        self.data = ''
        self.start = 0

    def put(self, s):
        if s:
            self.data = buffer(self.data, self.start) + s
            self.start = 0

    def peek(self, count):
        return buffer(self.data, self.start, count)

    def eat(self, count):
        self.start += count

    def get(self, count):
        v = buffer(self.data, self.start, count)
        self.start += count
        return v

    def used(self):
        return len(self.data) - self.start

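# A minimal sketch of Buf in use (hypothetical; not part of bup's API):
#
#   b = Buf()
#   b.put('hello world')
#   assert str(b.get(5)) == 'hello'   # get() returns a buffer; str() copies
#   b.eat(1)                          # skip the space
#   assert b.used() == 5
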
def splitbuf(buf):
    # Ask the C extension for a split point in the buffered bytes.  It
    # returns (ofs, bits): ofs is the offset just past the boundary (0 if
    # no boundary was found) and bits is how many low-order rolling-checksum
    # bits matched there, which drives the fanout logic below.
    b = buf.peek(buf.used())
    (ofs, bits) = _hashsplit.splitbuf(b)
    if ofs:
        buf.eat(ofs)
        return (buffer(b, 0, ofs), bits)
    return (None, 0)


def blobiter(files):
    # Read each input file in BLOB_HWM-sized chunks and yield the chunks.
    for f in files:
        while 1:
            b = f.read(BLOB_HWM)
            if not b:
                break
            yield b


def hashsplit_iter(files):
    assert(BLOB_HWM > BLOB_MAX)
    buf = Buf()
    fi = blobiter(files)
    while 1:
        (blob, bits) = splitbuf(buf)
        if blob:
            yield (blob, bits)
        else:
            while buf.used() >= BLOB_MAX:
                # limit max blob size: flush BLOB_MAX bytes even when no
                # natural split point was found
                yield (buf.get(BLOB_MAX), 0)
            while buf.used() < BLOB_HWM:
                bnew = next(fi)  # helpers' next(): returns None at EOF
                if not bnew:
                    # eof
                    if buf.used():
                        yield (buf.get(buf.used()), 0)
                    return
                buf.put(bnew)

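# A minimal sketch of driving hashsplit_iter directly (hypothetical; any
# iterable of open file objects will do):
#
#   for (blob, bits) in hashsplit_iter([open('/tmp/somefile')]):
#       print len(blob), bits
#
# 'bits' is the number of matching rolling-checksum bits at the split
# point, or 0 for blobs forced out by BLOB_MAX or end-of-input.
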
total_split = 0
def _split_to_blobs(w, files):
    # Store each split chunk as a git blob via the pack writer 'w' and
    # yield (sha, size, bits) for each one written.
    global total_split
    for (blob, bits) in hashsplit_iter(files):
        sha = w.new_blob(blob)
        total_split += len(blob)
        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
            w.breakpoint()  # close this packfile and start a fresh one
        if progress_callback:
            progress_callback(len(blob))
        yield (sha, len(blob), bits)


def _make_shalist(l):
    # Turn a list of (mode, sha, size) into git tree entries, naming each
    # entry after its zero-padded hex byte offset within the joined file.
    ofs = 0
    shalist = []
    for (mode, sha, size) in l:
        shalist.append((mode, '%016x' % ofs, sha))
        ofs += size
    total = ofs
    return (shalist, total)

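# For example (with hypothetical shas), chunks of sizes 10, 20 and 30
# come out named by running offset:
#
#   _make_shalist([('100644', sha1, 10),
#                  ('100644', sha2, 20),
#                  ('100644', sha3, 30)])
#   == ([('100644', '0000000000000000', sha1),
#        ('100644', '000000000000000a', sha2),
#        ('100644', '000000000000001e', sha3)], 60)
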
def _squish(w, stacks, n):
    # Collapse the bottom n levels of the fanout stacks: each non-empty
    # level is written out as a git tree and pushed onto the level above.
    i = 0
    while i < n or len(stacks[i]) > MAX_PER_TREE:
        while len(stacks) <= i+1:
            stacks.append([])
        if len(stacks[i]) == 1:
            # don't wrap a single entry in a pointless intermediate tree
            stacks[i+1] += stacks[i]
        elif stacks[i]:
            (shalist, size) = _make_shalist(stacks[i])
            tree = w.new_tree(shalist)
            stacks[i+1].append(('40000', tree, size))
        stacks[i] = []
        i += 1


def split_to_shalist(w, files):
    sl = _split_to_blobs(w, files)
    if not fanout:
        # no fanout: one flat list of blob entries
        shal = []
        for (sha,size,bits) in sl:
            shal.append(('100644', sha, size))
        return _make_shalist(shal)[0]
    else:
        base_bits = _hashsplit.blobbits()
        fanout_bits = int(math.log(fanout, 2))
        def bits_to_idx(n):
            # each fanout_bits of extra checksum bits pushes the boundary
            # one level higher in the tree
            assert(n >= base_bits)
            return (n - base_bits)/fanout_bits
        stacks = [[]]
        for (sha,size,bits) in sl:
            assert(bits <= 32)
            stacks[0].append(('100644', sha, size))
            if bits > base_bits:
                _squish(w, stacks, bits_to_idx(bits))
        #log('stacks: %r\n' % [len(i) for i in stacks])
        _squish(w, stacks, len(stacks)-1)
        #log('stacks: %r\n' % [len(i) for i in stacks])
        return _make_shalist(stacks[-1])[0]

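# A worked example of the fanout arithmetic (assuming _hashsplit.blobbits()
# returns 13, i.e. ~8k average chunks, and the default fanout of 16):
# fanout_bits = log2(16) = 4, so a boundary whose checksum matched 14-16
# bits squishes nothing extra (index 0), 17-20 bits closes off level 0
# into a subtree (index 1), 21-24 bits closes levels 0 and 1, and so on.
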
def split_to_blob_or_tree(w, files):
    # Reduce the split result to a single object: the lone entry's
    # (mode, sha) if there is exactly one, an empty blob if the input was
    # empty, or a tree wrapping the whole list otherwise.
    shalist = list(split_to_shalist(w, files))
    if len(shalist) == 1:
        return (shalist[0][0], shalist[0][2])
    elif len(shalist) == 0:
        return ('100644', w.new_blob(''))
    else:
        return ('40000', w.new_tree(shalist))
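

# A minimal end-to-end sketch (hypothetical; assumes an initialized bup
# repository and the git.PackWriter interface used above):
#
#   git.check_repo_or_die()
#   w = git.PackWriter()
#   shalist = split_to_shalist(w, [open('/tmp/somefile')])
#   sha = w.new_tree(shalist)
#   w.close()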