arthur.barton.de Git - bup.git/blob - hashsplit.py
cmd-index: correct reporting of deleted vs. added vs. modified status.
[bup.git] / hashsplit.py
1 import sys
2 import git, chashsplit
3 from helpers import *
4
# Buffer low-water mark: hashsplit_iter() reads more input whenever fewer
# than this many bytes are buffered.
BLOB_LWM = 2 * 8192
# Once this many bytes are buffered without a split point, hashsplit_iter()
# forces a blob out.  Also the starting alignment used by split_to_shalist()
# when naming chunks.
BLOB_MAX = 2 * BLOB_LWM
# Number of bytes requested per read() call in blobiter().
BLOB_HWM = 1024 * 1024
# Progress verbosity: >= 1 logs a running megabyte count, >= 2 also logs a
# '.' for every split point found.
split_verbosely = 0
# hashsplit_iter() calls w.breakpoint() once the writer reports at least
# this many bytes or this many objects.
max_pack_size = 10 ** 9
max_pack_objects = 10 ** 7
# Number of chunks grouped into one subtree by split_to_tree(); a false
# value disables grouping.
fanout = 4096
12
class Buf:
    """Simple FIFO byte buffer with a read cursor.

    `data` holds the stored bytes and `start` is the index of the first
    byte that has not been consumed yet.  Consumed bytes are only dropped
    from `data` the next time put() is called.
    """

    def __init__(self):
        self.start = 0
        self.data = ''

    def put(self, s):
        """Append s to the buffer; an empty s leaves the buffer untouched."""
        if not s:
            return
        # Keep only the unread tail, then add the new data after it.
        unread = buffer(self.data, self.start)
        self.data = unread + s
        self.start = 0

    def peek(self, count):
        """Return a buffer over up to count unread bytes without consuming."""
        return buffer(self.data, self.start, count)

    def eat(self, count):
        """Mark count bytes as consumed."""
        self.start = self.start + count

    def get(self, count):
        """Return a buffer over up to count unread bytes and consume count."""
        chunk = self.peek(count)
        self.eat(count)
        return chunk

    def used(self):
        """Return the number of unread bytes."""
        return len(self.data) - self.start
37
38
def splitbuf(buf):
    """Try to cut one blob off the front of buf.

    All unread bytes of buf are handed to chashsplit.splitbuf().  If it
    reports a non-zero offset, that many bytes are consumed from buf and
    returned as a buffer; otherwise buf is left unchanged and None is
    returned.
    """
    window = buf.peek(buf.used())
    cut = chashsplit.splitbuf(window)
    if not cut:
        return None
    if split_verbosely >= 2:
        log('.')
    buf.eat(cut)
    return buffer(window, 0, cut)
49
50
def blobiter(files):
    """Yield the contents of every file in files, in order, as raw chunks.

    Each file is read BLOB_HWM bytes at a time until read() returns an
    empty result.  After the last file a single '' is yielded to signal
    end of input.
    """
    for f in files:
        while True:
            chunk = f.read(BLOB_HWM)
            if not chunk:
                break
            yield chunk
    yield ''  # EOF indicator
59
60
def autofiles(filenames):
    """Yield an open file object for each name in filenames.

    If filenames is empty (or None), sys.stdin is yielded instead.  Files
    are opened lazily, one per iteration step, and are not closed here;
    closing them is left to the consumer.
    """
    if not filenames:
        yield sys.stdin
    else:
        for n in filenames:
            # The contents are hashed and split as raw bytes, so open in
            # binary mode to avoid any newline translation or decoding.
            yield open(n, 'rb')
67             
68     
def hashsplit_iter(w, files):
    """Split the concatenated contents of files into blobs and store them.

    w must provide `outbytes`, `count`, `breakpoint()` and `new_blob()`;
    files is an iterable of objects with a read() method.

    Yields one (offset, size, result) tuple per blob, where offset is the
    position of the blob in the overall input, size is its length and
    result is whatever w.new_blob(blob) returned.
    """
    offset = 0
    pending = Buf()
    chunks = blobiter(files)
    blob = 1  # truthy placeholder so the loop body runs at least once
    at_eof = False
    last_mb = 0
    while blob or not at_eof:
        # Read more input when the buffer is running low, or when the
        # previous pass could not produce a blob from what was buffered.
        if not at_eof and (pending.used() < BLOB_LWM or not blob):
            incoming = chunks.next()
            if not incoming:
                at_eof = True
            pending.put(incoming)

        blob = splitbuf(pending)
        if not blob and (at_eof or pending.used() >= BLOB_MAX):
            # No split point: flush whatever is buffered, either because
            # the input is finished or to limit max blob size.
            # NOTE(review): this emits everything buffered, which can be
            # larger than BLOB_MAX -- confirm that is intended.
            blob = pending.get(pending.used())
        if not blob and not at_eof:
            continue

        if blob:
            if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
                w.breakpoint()
            size = len(blob)
            yield (offset, size, w.new_blob(blob))
            offset += size

        # Progress report: log each time the megabyte count changes.
        mb_seen = (offset + pending.used()) // 1000000
        if mb_seen != last_mb:
            if split_verbosely >= 1:
                log('%d\t' % mb_seen)
            last_mb = mb_seen
104
105
# Running total of the sizes of all blobs produced by split_to_shalist().
total_split = 0
def split_to_shalist(w, files):
    """Yield a (mode, name, sha) entry for every blob split from files.

    mode is always '100644' and name is 'bup.chunk.' followed by a
    16-digit hex offset.  The size of every blob is added to the
    module-level total_split counter.
    """
    global total_split
    last_ofs = 0
    for (ofs, size, sha) in hashsplit_iter(w, files):
        # This silliness keeps chunk filenames "similar" when a file
        # changes slightly: round the offset down to the coarsest
        # power-of-two fraction of BLOB_MAX that still yields a value
        # greater than the previously used one.
        step = BLOB_MAX
        name_ofs = ofs // step * step
        while name_ofs <= last_ofs and ofs != last_ofs:
            step //= 2
            name_ofs = ofs // step * step
        last_ofs = name_ofs
        total_split += size
        yield ('100644', 'bup.chunk.%016x' % name_ofs, sha)
124
125
126 def _next(i):
127     try:
128         return i.next()
129     except StopIteration:
130         return None
131
132
def split_to_tree(w, files):
    """Split files into chunks and store them as a tree via w.

    When the module-level fanout is non-zero, chunks are collected into
    groups of at least max(fanout, 3) entries; each full group is stored
    as its own subtree (mode '40000', named after its first chunk) and any
    leftover chunks are appended directly.  When fanout is false, all
    chunks go into a single flat list.

    Returns (shalist, tree) where tree is the result of
    w.new_tree(shalist).
    """
    entries = iter(split_to_shalist(w, files))
    if fanout:
        shalist = []
        group = []
        for entry in entries:
            group.append(entry)
            count = len(group)
            if count >= fanout and count >= 3:
                first_name = group[0][1]
                subtree = w.new_tree(group)
                shalist.append(('40000', first_name, subtree))
                group = []
        shalist.extend(group)
    else:
        shalist = list(entries)
    tree = w.new_tree(shalist)
    return (shalist, tree)
148
149
def split_to_blob_or_tree(w, files):
    """Split files and return a (mode, id) pair for the stored result.

    - no chunks: a new empty blob is created, mode '100644';
    - exactly one chunk: that chunk's own mode and id are returned;
    - several chunks: they are wrapped in a new tree, mode '40000'.
    """
    shalist = list(split_to_shalist(w, files))
    count = len(shalist)
    if count == 1:
        only = shalist[0]
        return (only[0], only[2])
    if count == 0:
        return ('100644', w.new_blob(''))
    return ('40000', w.new_tree(shalist))
155         return ('100644', w.new_blob(''))
156     else:
157         return ('40000', w.new_tree(shalist))