# hashsplit.py -- from the bup project (bup.git), blob snapshot as of the
# commit "cmd-save: fix a potential divide by zero error."
1 import sys
2 import git, _hashsplit
3 from helpers import *
4
BLOB_LWM = 8192*2        # low-water mark: refill the buffer when below this
BLOB_MAX = BLOB_LWM*2    # force a chunk out once this much is buffered unsplit
BLOB_HWM = 1024*1024     # bytes requested per f.read() in blobiter()
split_verbosely = 0      # 0=quiet, 1=log progress per MB, 2=also '.' per split
progress_callback = None # optional callable(size), invoked once per chunk
max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
fanout = 4096            # chunks per subtree in split_to_shalist (falsy=flat list)
13
class Buf:
    """A simple byte FIFO: append at the back with put(), consume from
    the front with eat()/get()/peek().

    Consumed bytes are tracked with a start offset, so the backing
    string is only rebuilt (compacted) when new data arrives.  Uses the
    Python 2 buffer() type for zero-copy slices of the backing string.
    """
    def __init__(self):
        self.data = ''   # backing string; bytes before self.start are spent
        self.start = 0   # index of the first unconsumed byte

    def put(self, s):
        # Compact on append: drop the consumed prefix, then concatenate.
        # (buffer + str yields a plain str under Python 2.)
        if s:
            self.data = buffer(self.data, self.start) + s
            self.start = 0

    def peek(self, count):
        # Zero-copy view of up to `count` unconsumed bytes.
        return buffer(self.data, self.start, count)

    def eat(self, count):
        # Discard `count` bytes from the front.
        self.start += count

    def get(self, count):
        # peek() and eat() in one step.
        result = buffer(self.data, self.start, count)
        self.start += count
        return result

    def used(self):
        # Number of unconsumed bytes currently held.
        return len(self.data) - self.start
38
39
def splitbuf(buf):
    """Try to take one hashsplit chunk off the front of `buf` (a Buf).

    Returns a zero-copy buffer holding the chunk, or None if the C
    splitter found no split point in the currently buffered data.
    """
    b = buf.peek(buf.used())
    ofs = _hashsplit.splitbuf(b)
    if not ofs:
        return None
    if split_verbosely >= 2:
        log('.')
    buf.eat(ofs)
    return buffer(b, 0, ofs)
49
50
def blobiter(files):
    """Yield the contents of each file in chunks of up to BLOB_HWM bytes,
    then yield a single '' so consumers can detect end-of-input."""
    for f in files:
        while 1:
            chunk = f.read(BLOB_HWM)
            if not chunk:
                break
            yield chunk
    yield '' # EOF indicator
59
60
def autofiles(filenames):
    """Yield an open file object per name in filenames, or sys.stdin if
    filenames is empty/None.  Callers own (and must close) the files."""
    if filenames:
        for name in filenames:
            yield open(name)
    else:
        yield sys.stdin
67             
68     
def hashsplit_iter(w, files):
    """Split the concatenated contents of `files` into content-defined blobs.

    `w` is a pack writer providing new_blob()/breakpoint() and the
    outbytes/count counters (presumably git.PackWriter -- confirm against
    callers).  Yields one (ofs, size, sha) tuple per blob written, where
    ofs is the blob's offset in the overall input stream.
    """
    ofs = 0            # bytes of input already emitted as blobs
    buf = Buf()        # data read from input but not yet split off
    fi = blobiter(files)
    blob = 1           # nonzero so the loop body runs at least once

    eof = 0
    lv = 0             # last whole-MB progress value logged
    while blob or not eof:
        # Refill when the buffer runs low, or when the last pass found
        # no split point (more data may reveal one).
        if not eof and (buf.used() < BLOB_LWM or not blob):
            bnew = fi.next()
            if not bnew: eof = 1   # blobiter yields '' exactly once, at EOF
            #log('got %d, total %d\n' % (len(bnew), buf.used()))
            buf.put(bnew)

        blob = splitbuf(buf)
        if eof and not blob:
            # No split point and no more input: flush whatever remains.
            blob = buf.get(buf.used())
        if not blob and buf.used() >= BLOB_MAX:
            blob = buf.get(buf.used())  # limit max blob size
        if not blob and not eof:
            continue

        if blob:
            # Start a fresh pack before the current one gets unwieldy.
            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
                w.breakpoint()
            yield (ofs, len(blob), w.new_blob(blob))
            ofs += len(blob)

        # Progress: log once per megabyte of input consumed (Python 2
        # integer division).
        nv = (ofs + buf.used())/1000000
        if nv != lv:
            if split_verbosely >= 1:
                log('%d\t' % nv)
            lv = nv
103
104
total_split = 0  # running total of bytes split, accumulated across calls
def _split_to_shalist(w, files):
    """Yield a git tree entry ('100644', 'bup.chunk.%016x' % cn, sha) for
    each blob produced by hashsplit_iter(w, files).

    The chunk name is the offset rounded down to a granularity `bm` that
    is halved until the name advances past the previous chunk's name, so
    small edits to the input leave most chunk names unchanged.
    """
    global total_split
    ofs = 0
    last_ofs = 0
    for (ofs, size, sha) in hashsplit_iter(w, files):
        #log('SPLIT @ %-8d size=%-8d\n' % (ofs, size))
        # this silliness keeps chunk filenames "similar" when a file changes
        # slightly.
        bm = BLOB_MAX
        while 1:
            # Round ofs down to a multiple of bm (Python 2 integer '/').
            cn = ofs / bm * bm
            #log('%x,%x,%x,%x\n' % (last_ofs,ofs,cn,bm))
            if cn > last_ofs or ofs == last_ofs: break
            # Name didn't advance: halve the granularity and retry.
            bm /= 2
        last_ofs = cn
        total_split += size
        if progress_callback:
            progress_callback(size)
        yield ('100644', 'bup.chunk.%016x' % cn, sha)
125
126
127 def _next(i):
128     try:
129         return i.next()
130     except StopIteration:
131         return None
132
133
def split_to_shalist(w, files):
    """Split `files` into chunks and return them as a git shalist.

    With fanout disabled (falsy), the chunks are returned as one flat
    list.  Otherwise, consecutive chunks are grouped into subtree
    entries ('40000', first-chunk-name, tree-sha) once a group reaches
    max(fanout, 3) entries; any trailing partial group stays ungrouped.
    """
    chunks = iter(_split_to_shalist(w, files))
    if not fanout:
        return list(chunks)
    shalist = []
    pending = []
    for entry in chunks:
        pending.append(entry)
        if len(pending) >= fanout and len(pending) >= 3:
            shalist.append(('40000', pending[0][1], w.new_tree(pending)))
            pending = []
    shalist += pending
    return shalist
148
149
def split_to_blob_or_tree(w, files):
    """Split `files` and return a single (mode, sha) pair: the lone
    chunk itself if there is exactly one, an empty blob if there are
    none, or a tree containing all the chunks otherwise."""
    shalist = list(split_to_shalist(w, files))
    if not shalist:
        return ('100644', w.new_blob(''))
    if len(shalist) == 1:
        (mode, name, sha) = shalist[0]
        return (mode, sha)
    return ('40000', w.new_tree(shalist))