X-Git-Url: https://arthur.barton.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=lib%2Fbup%2Fhashsplit.py;h=3cbcfc981b4e1280604712b2b0ffe2fbeff9411e;hb=c40b3dd5fd74e72024fbaad3daf5a958aefa1c54;hp=345f67bc097273dc10ee3732d58320f8ec31de26;hpb=e166d6da6dbe23706b24c3a9485ea8ae7b6c7503;p=bup.git diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py index 345f67b..3cbcfc9 100644 --- a/lib/bup/hashsplit.py +++ b/lib/bup/hashsplit.py @@ -1,6 +1,11 @@ -import math -from bup import _helpers -from bup.helpers import * + +from __future__ import absolute_import +import io, math, os + +from bup import _helpers, helpers +from bup.helpers import sc_page_size + +_fmincore = getattr(helpers, 'fmincore', None) BLOB_MAX = 8192*4 # 8192 is the "typical" blob size for bupsplit BLOB_READ_SIZE = 1024*1024 @@ -8,10 +13,9 @@ MAX_PER_TREE = 256 progress_callback = None fanout = 16 -GIT_MODE_FILE = 0100644 -GIT_MODE_TREE = 040000 -GIT_MODE_SYMLINK = 0120000 -assert(GIT_MODE_TREE != 40000) # 0xxx should be treated as octal +GIT_MODE_FILE = 0o100644 +GIT_MODE_TREE = 0o40000 +GIT_MODE_SYMLINK = 0o120000 # The purpose of this type of buffer is to avoid copying on peek(), get(), # and eat(). We do copy the buffer contents on put(), but that should @@ -41,31 +45,93 @@ class Buf: return len(self.data) - self.start +def _fadvise_pages_done(fd, first_page, count): + assert(first_page >= 0) + assert(count >= 0) + if count > 0: + _helpers.fadvise_done(fd, + first_page * sc_page_size, + count * sc_page_size) + + +def _nonresident_page_regions(status_bytes, incore_mask, max_region_len=None): + """Return (start_page, count) pairs in ascending start_page order for + each contiguous region of nonresident pages indicated by the + mincore() status_bytes. Limit the number of pages in each region + to max_region_len.""" + assert(max_region_len is None or max_region_len > 0) + start = None + for i, x in enumerate(status_bytes): + in_core = x & incore_mask + if start is None: + if not in_core: + start = i + else: + count = i - start + if in_core: + yield (start, count) + start = None + elif max_region_len and count >= max_region_len: + yield (start, count) + start = i + if start is not None: + yield (start, len(status_bytes) - start) + + +def _uncache_ours_upto(fd, offset, first_region, remaining_regions): + """Uncache the pages of fd indicated by first_region and + remaining_regions that are before offset, where each region is a + (start_page, count) pair. The final region must have a start_page + of None.""" + rstart, rlen = first_region + while rstart is not None and (rstart + rlen) * sc_page_size <= offset: + _fadvise_pages_done(fd, rstart, rlen) + rstart, rlen = next(remaining_regions, (None, None)) + return (rstart, rlen) + + def readfile_iter(files, progress=None): for filenum,f in enumerate(files): ofs = 0 b = '' + fd = rpr = rstart = rlen = None + if _fmincore and hasattr(f, 'fileno'): + try: + fd = f.fileno() + except io.UnsupportedOperation: + pass + if fd: + mcore = _fmincore(fd) + if mcore: + max_chunk = max(1, (8 * 1024 * 1024) / sc_page_size) + rpr = _nonresident_page_regions(mcore, helpers.MINCORE_INCORE, + max_chunk) + rstart, rlen = next(rpr, (None, None)) while 1: if progress: progress(filenum, len(b)) - fadvise_done(f, max(0, ofs - 1024*1024)) b = f.read(BLOB_READ_SIZE) ofs += len(b) + if rpr: + rstart, rlen = _uncache_ours_upto(fd, ofs, (rstart, rlen), rpr) if not b: - fadvise_done(f, ofs) break yield b + if rpr: + rstart, rlen = _uncache_ours_upto(fd, ofs, (rstart, rlen), rpr) def _splitbuf(buf, basebits, fanbits): while 1: b = buf.peek(buf.used()) (ofs, bits) = _helpers.splitbuf(b) - if ofs > BLOB_MAX: - ofs = BLOB_MAX if ofs: + if ofs > BLOB_MAX: + ofs = BLOB_MAX + level = 0 + else: + level = (bits-basebits)//fanbits # integer division buf.eat(ofs) - level = (bits-basebits)//fanbits # integer division yield buffer(b, 0, ofs), level else: break @@ -166,9 +232,10 @@ def split_to_shalist(makeblob, maketree, files, return _make_shalist(stacks[-1])[0] -def split_to_blob_or_tree(makeblob, maketree, files, keep_boundaries): +def split_to_blob_or_tree(makeblob, maketree, files, + keep_boundaries, progress=None): shalist = list(split_to_shalist(makeblob, maketree, - files, keep_boundaries)) + files, keep_boundaries, progress)) if len(shalist) == 1: return (shalist[0][0], shalist[0][2]) elif len(shalist) == 0: @@ -187,9 +254,3 @@ def open_noatime(name): except: pass raise - - -def fadvise_done(f, ofs): - assert(ofs >= 0) - if ofs > 0 and hasattr(f, 'fileno'): - _helpers.fadvise_done(f.fileno(), ofs)