lib/bup/hashsplit.py
import sys, math, os
from bup import git, _hashsplit
from bup.helpers import *

BLOB_LWM = 8192*2               # blob low-water mark
BLOB_MAX = BLOB_LWM*2           # hard upper bound on a single blob's size
BLOB_HWM = 1024*1024            # high-water mark: read this much per chunk
MAX_PER_TREE = 256              # fan out a new tree level past this many entries
progress_callback = None
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000     # cache memory usage is about 83 bytes per object
fanout = 16

class Buf:
    # A simple byte FIFO.  Python 2's buffer() lets peek() and get()
    # return zero-copy slices of the underlying string.
    def __init__(self):
        self.data = ''
        self.start = 0

    def put(self, s):
        if s:
            self.data = buffer(self.data, self.start) + s
            self.start = 0

    def peek(self, count):
        return buffer(self.data, self.start, count)

    def eat(self, count):
        self.start += count

    def get(self, count):
        v = buffer(self.data, self.start, count)
        self.start += count
        return v

    def used(self):
        return len(self.data) - self.start

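# A minimal usage sketch (illustration only, not part of bup): put()
# appends bytes, peek()/get() return buffer() slices, and eat() discards
# bytes that splitbuf() below has already consumed.
#
#   b = Buf()
#   b.put('hello world')
#   assert str(b.peek(5)) == 'hello'
#   b.eat(6)                   # drop 'hello '
#   assert str(b.get(5)) == 'world'
#   assert b.used() == 0
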
def splitbuf(buf):
    # Ask the C extension for the first rolling-checksum split point;
    # 'bits' counts how many low bits of the checksum matched there
    # (used for the fanout levels in split_to_shalist).
    b = buf.peek(buf.used())
    (ofs, bits) = _hashsplit.splitbuf(b)
    if ofs:
        buf.eat(ofs)
        return (buffer(b, 0, ofs), bits)
    return (None, 0)


def blobiter(files):
    for f in files:
        while 1:
            b = f.read(BLOB_HWM)
            if not b:
                break
            yield b


def drainbuf(buf, finalize):
    while 1:
        (blob, bits) = splitbuf(buf)
        if blob:
            yield (blob, bits)
        else:
            break
    while buf.used() > BLOB_MAX:
        # limit max blob size: no split point was found, so flush the
        # oversized remainder in BLOB_MAX-sized pieces.
        yield (buf.get(BLOB_MAX), 0)
    if finalize and buf.used():
        yield (buf.get(buf.used()), 0)


def hashsplit_iter(files):
    assert(BLOB_HWM > BLOB_MAX)
    buf = Buf()
    fi = blobiter(files)
    while 1:
        for i in drainbuf(buf, finalize=False):
            yield i
        while buf.used() < BLOB_HWM:
            # next() is bup's helper (from bup.helpers), which returns
            # None at the end of iteration instead of raising StopIteration.
            bnew = next(fi)
            if not bnew:
                # eof
                for i in drainbuf(buf, finalize=True):
                    yield i
                return
            buf.put(bnew)


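# A sketch of driving the splitter directly (illustration only; the path
# is made up).  Because split points come from a rolling checksum over
# nearby content, inserting bytes early in a file does not shift the
# split points that follow:
#
#   f = open('/tmp/somefile', 'rb')
#   for (blob, bits) in hashsplit_iter([f]):
#       print len(blob), bits
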
total_split = 0
def _split_to_blobs(w, files):
    # 'w' is a git.PackWriter; each hashsplit chunk is stored as one blob.
    global total_split
    for (blob, bits) in hashsplit_iter(files):
        sha = w.new_blob(blob)
        total_split += len(blob)
        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
            w.breakpoint()  # finish this pack and start a new one
        if progress_callback:
            progress_callback(len(blob))
        yield (sha, len(blob), bits)


def _make_shalist(l):
    # Name each tree entry by its cumulative byte offset within the file,
    # zero-padded so the entries sort in file order.
    ofs = 0
    shalist = []
    for (mode, sha, size) in l:
        shalist.append((mode, '%016x' % ofs, sha))
        ofs += size
    total = ofs
    return (shalist, total)


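# Worked example (hypothetical shas): three chunks of sizes 10, 20 and 30
# are named by their starting offsets in hex:
#
#   _make_shalist([('100644', sha1, 10),
#                  ('100644', sha2, 20),
#                  ('100644', sha3, 30)])
#   # -> ([('100644', '0000000000000000', sha1),
#   #      ('100644', '000000000000000a', sha2),
#   #      ('100644', '000000000000001e', sha3)], 60)
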
def _squish(w, stacks, n):
    # Collapse every stack level below n, plus any level that has grown
    # past MAX_PER_TREE, into a single tree entry on the level above.
    i = 0
    while i < n or len(stacks[i]) > MAX_PER_TREE:
        while len(stacks) <= i+1:
            stacks.append([])
        if len(stacks[i]) == 1:
            # a tree with a single entry is pointless; promote it as-is
            stacks[i+1] += stacks[i]
        elif stacks[i]:
            (shalist, size) = _make_shalist(stacks[i])
            tree = w.new_tree(shalist)
            stacks[i+1].append(('40000', tree, size))
        stacks[i] = []
        i += 1


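# Worked example of the fanout arithmetic used in split_to_shalist below:
# with fanout = 16, fanout_bits = 4.  A chunk whose rolling checksum
# matched base_bits + 8 bits gives bits_to_idx == 8 // 4 == 2, so
# _squish() closes out tree levels 0 and 1 at that chunk boundary.
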
def split_to_shalist(w, files):
    sl = _split_to_blobs(w, files)
    if not fanout:
        # no fanout: one flat list of blobs
        shal = []
        for (sha,size,bits) in sl:
            shal.append(('100644', sha, size))
        return _make_shalist(shal)[0]
    else:
        base_bits = _hashsplit.blobbits()
        fanout_bits = int(math.log(fanout, 2))
        def bits_to_idx(n):
            assert(n >= base_bits)
            return (n - base_bits)//fanout_bits
        stacks = [[]]
        for (sha,size,bits) in sl:
            assert(bits <= 32)
            stacks[0].append(('100644', sha, size))
            if bits > base_bits:
                _squish(w, stacks, bits_to_idx(bits))
        #log('stacks: %r\n' % [len(i) for i in stacks])
        _squish(w, stacks, len(stacks)-1)
        #log('stacks: %r\n' % [len(i) for i in stacks])
        return _make_shalist(stacks[-1])[0]


def split_to_blob_or_tree(w, files):
    # Return a (mode, sha) pair: the single blob if everything fit in one
    # chunk, an empty blob for empty input, or a tree otherwise.
    shalist = list(split_to_shalist(w, files))
    if len(shalist) == 1:
        return (shalist[0][0], shalist[0][2])
    elif len(shalist) == 0:
        return ('100644', w.new_blob(''))
    else:
        return ('40000', w.new_tree(shalist))


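# A hedged usage sketch (the PackWriter setup is abbreviated; see
# cmd-save for the real call sites):
#
#   w = git.PackWriter()
#   f = open_noatime('/tmp/somefile')
#   (mode, sha) = split_to_blob_or_tree(w, [f])
#   w.close()
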
def open_noatime(name):
    # Open via the C helper so O_NOATIME is used where the OS supports it
    # (backing up files shouldn't update every file's atime).
    fd = _hashsplit.open_noatime(name)
    try:
        return os.fdopen(fd, 'rb', 1024*1024)
    except:
        try:
            os.close(fd)
        except:
            pass
        raise
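

# A rough pure-Python sketch of what the C helper presumably does (an
# assumption for illustration: the exact fallback behavior lives in the
# _hashsplit extension, not here):
#
#   def open_noatime(name):
#       O_NOATIME = getattr(os, 'O_NOATIME', 0)   # 0 where unsupported
#       try:
#           fd = os.open(name, os.O_RDONLY | O_NOATIME)
#       except OSError:
#           # e.g. EPERM when reading files owned by someone else
#           fd = os.open(name, os.O_RDONLY)
#       return os.fdopen(fd, 'rb', 1024*1024)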