lib/bup/helpers.py

   1 """Helper functions and classes for bup."""
   2
   3 from __future__ import absolute_import, division
   4 from collections import namedtuple
   5 from contextlib import contextmanager
   6 from ctypes import sizeof, c_void_p
   7 from math import floor
   8 from os import environ
   9 from subprocess import PIPE, Popen
  10 import sys, os, subprocess, errno, select, mmap, stat, re, struct
  11 import hashlib, heapq, math, operator, time, tempfile
  12
  13 from bup import _helpers
  14 from bup import compat
  15 from bup.compat import argv_bytes, byte_int, nullcontext, pending_raise
  16 from bup.io import byte_stream, path_msg
  17 # This function should really be in helpers, not in bup.options.  But we
  18 # want options.py to be standalone so people can include it in other projects.
  19 from bup.options import _tty_width as tty_width
  20
  21
  22 buglvl = int(os.environ.get('BUP_DEBUG', 0))
  23
  24
  25 class Nonlocal:
  26     """Helper to deal with Python scoping issues"""
  27     pass
  28
  29
  30 def nullcontext_if_not(manager):
  31     return manager if manager is not None else nullcontext()
  32
  33
  34 @contextmanager
  35 def finalized(enter_result=None, finalize=None):
  36     assert finalize
  37     try:
  38         yield enter_result
  39     except BaseException as ex:
  40         with pending_raise(ex):
  41             finalize(enter_result)
  42     finalize(enter_result)
  43
  44
  45 sc_page_size = os.sysconf('SC_PAGE_SIZE')
  46 assert(sc_page_size > 0)
  47
  48 sc_arg_max = os.sysconf('SC_ARG_MAX')
  49 if sc_arg_max == -1:  # "no definite limit" - let's choose 2M
  50     sc_arg_max = 2 * 1024 * 1024
  51
  52 def last(iterable):
  53     result = None
  54     for result in iterable:
  55         pass
  56     return result
  57
  58 try:
  59     _fdatasync = os.fdatasync
  60 except AttributeError:
  61     _fdatasync = os.fsync
  62
  63 if sys.platform.startswith('darwin'):
  64     # Apparently os.fsync on OS X doesn't guarantee to sync all the way down
  65     import fcntl
  66     def fdatasync(fd):
  67         try:
  68             return fcntl.fcntl(fd, fcntl.F_FULLFSYNC)
  69         except IOError as e:
  70             # Fallback for file systems (SMB) that do not support F_FULLFSYNC
  71             if e.errno == errno.ENOTSUP:
  72                 return _fdatasync(fd)
  73             else:
  74                 raise
  75 else:
  76     fdatasync = _fdatasync
  77
  78
  79 def partition(predicate, stream):
  80     """Returns (leading_matches_it, rest_it), where leading_matches_it
  81     must be completely exhausted before traversing rest_it.
  82
  83     """
  84     stream = iter(stream)
  85     ns = Nonlocal()
  86     ns.first_nonmatch = None
  87     def leading_matches():
  88         for x in stream:
  89             if predicate(x):
  90                 yield x
  91             else:
  92                 ns.first_nonmatch = (x,)
  93                 break
  94     def rest():
  95         if ns.first_nonmatch:
  96             yield ns.first_nonmatch[0]
  97             for x in stream:
  98                 yield x
  99     return (leading_matches(), rest())
 100
 101
 102 def merge_dict(*xs):
 103     result = {}
 104     for x in xs:
 105         result.update(x)
 106     return result
 107
 108
 109 def lines_until_sentinel(f, sentinel, ex_type):
 110     # sentinel must end with \n and must contain only one \n
 111     while True:
 112         line = f.readline()
 113         if not (line and line.endswith(b'\n')):
 114             raise ex_type('Hit EOF while reading line')
 115         if line == sentinel:
 116             return
 117         yield line
 118
 119
 120 def stat_if_exists(path):
 121     try:
 122         return os.stat(path)
 123     except OSError as e:
 124         if e.errno != errno.ENOENT:
 125             raise
 126     return None
 127
 128
 129 # Write (blockingly) to sockets that may or may not be in blocking mode.
 130 # We need this because our stderr is sometimes eaten by subprocesses
 131 # (probably ssh) that sometimes make it nonblocking, if only temporarily,
 132 # leading to race conditions.  Ick.  We'll do it the hard way.
 133 def _hard_write(fd, buf):
 134     while buf:
 135         (r,w,x) = select.select([], [fd], [], None)
 136         if not w:
 137             raise IOError('select(fd) returned without being writable')
 138         try:
 139             sz = os.write(fd, buf)
 140         except OSError as e:
 141             if e.errno != errno.EAGAIN:
 142                 raise
 143         assert(sz >= 0)
 144         buf = buf[sz:]
 145
 146
 147 _last_prog = 0
 148 def log(s):
 149     """Print a log message to stderr."""
 150     global _last_prog
 151     sys.stdout.flush()
 152     _hard_write(sys.stderr.fileno(), s if isinstance(s, bytes) else s.encode())
 153     _last_prog = 0
 154
 155
 156 def debug1(s):
 157     if buglvl >= 1:
 158         log(s)
 159
 160
 161 def debug2(s):
 162     if buglvl >= 2:
 163         log(s)
 164
 165
 166 istty1 = os.isatty(1) or (int(os.environ.get('BUP_FORCE_TTY', 0)) & 1)
 167 istty2 = os.isatty(2) or (int(os.environ.get('BUP_FORCE_TTY', 0)) & 2)
 168 _last_progress = ''
 169 def progress(s):
 170     """Calls log() if stderr is a TTY.  Does nothing otherwise."""
 171     global _last_progress
 172     if istty2:
 173         log(s)
 174         _last_progress = s
 175
 176
 177 def qprogress(s):
 178     """Calls progress() only if we haven't printed progress in a while.
 179
 180     This avoids overloading the stderr buffer with excess junk.
 181     """
 182     global _last_prog
 183     now = time.time()
 184     if now - _last_prog > 0.1:
 185         progress(s)
 186         _last_prog = now
 187
 188
 189 def reprogress():
 190     """Calls progress() to redisplay the most recent progress message.
 191
 192     Useful after you've printed some other message that wipes out the
 193     progress line.
 194     """
 195     if _last_progress and _last_progress.endswith('\r'):
 196         progress(_last_progress)
 197
 198
 199 def mkdirp(d, mode=None):
 200     """Recursively create directories on path 'd'.
 201
 202     Unlike os.makedirs(), it doesn't raise an exception if the last element of
 203     the path already exists.
 204     """
 205     try:
 206         if mode:
 207             os.makedirs(d, mode)
 208         else:
 209             os.makedirs(d)
 210     except OSError as e:
 211         if e.errno == errno.EEXIST:
 212             pass
 213         else:
 214             raise
 215
 216
 217 class MergeIterItem:
 218     def __init__(self, entry, read_it):
 219         self.entry = entry
 220         self.read_it = read_it
 221     def __lt__(self, x):
 222         return self.entry < x.entry
 223
 224 def merge_iter(iters, pfreq, pfunc, pfinal, key=None):
 225     if key:
 226         samekey = lambda e, pe: getattr(e, key) == getattr(pe, key, None)
 227     else:
 228         samekey = operator.eq
 229     count = 0
 230     total = sum(len(it) for it in iters)
 231     iters = (iter(it) for it in iters)
 232     heap = ((next(it, None),it) for it in iters)
 233     heap = [MergeIterItem(e, it) for e, it in heap if e]
 234
 235     heapq.heapify(heap)
 236     pe = None
 237     while heap:
 238         if not count % pfreq:
 239             pfunc(count, total)
 240         e, it = heap[0].entry, heap[0].read_it
 241         if not samekey(e, pe):
 242             pe = e
 243             yield e
 244         count += 1
 245         try:
 246             e = next(it)
 247         except StopIteration:
 248             heapq.heappop(heap) # remove current
 249         else:
 250             # shift current to new location
 251             heapq.heapreplace(heap, MergeIterItem(e, it))
 252     pfinal(count, total)
 253
 254
 255 def unlink(f):
 256     """Delete a file at path 'f' if it currently exists.
 257
 258     Unlike os.unlink(), does not throw an exception if the file didn't already
 259     exist.
 260     """
 261     try:
 262         os.unlink(f)
 263     except OSError as e:
 264         if e.errno != errno.ENOENT:
 265             raise
 266
 267
 268 _bq_simple_id_rx = re.compile(br'^[-_./a-zA-Z0-9]+$')
 269 _sq_simple_id_rx = re.compile(r'^[-_./a-zA-Z0-9]+$')
 270
 271 def bquote(x):
 272     if x == b'':
 273         return b"''"
 274     if _bq_simple_id_rx.match(x):
 275         return x
 276     return b"'%s'" % x.replace(b"'", b"'\"'\"'")
 277
 278 def squote(x):
 279     if x == '':
 280         return "''"
 281     if _sq_simple_id_rx.match(x):
 282         return x
 283     return "'%s'" % x.replace("'", "'\"'\"'")
 284
 285 def quote(x):
 286     if isinstance(x, bytes):
 287         return bquote(x)
 288     if isinstance(x, compat.str_type):
 289         return squote(x)
 290     assert False
 291     # some versions of pylint get confused
 292     return None
 293
 294 def shstr(cmd):
 295     """Return a shell quoted string for cmd if it's a sequence, else cmd.
 296
 297     cmd must be a string, bytes, or a sequence of one or the other,
 298     and the assumption is that if cmd is a string or bytes, then it's
 299     already quoted (because it's what's actually being passed to
 300     call() and friends.  e.g. log(shstr(cmd)); call(cmd)
 301
 302     """
 303     if isinstance(cmd, (bytes, compat.str_type)):
 304         return cmd
 305     elif all(isinstance(x, bytes) for x in cmd):
 306         return b' '.join(map(bquote, cmd))
 307     elif all(isinstance(x, compat.str_type) for x in cmd):
 308         return ' '.join(map(squote, cmd))
 309     raise TypeError('unsupported shstr argument: ' + repr(cmd))
 310
 311
 312 exc = subprocess.check_call
 313
 314 def exo(cmd,
 315         input=None,
 316         stdin=None,
 317         stderr=None,
 318         shell=False,
 319         check=True,
 320         preexec_fn=None,
 321         close_fds=True):
 322     if input:
 323         assert stdin in (None, PIPE)
 324         stdin = PIPE
 325     p = Popen(cmd,
 326               stdin=stdin, stdout=PIPE, stderr=stderr,
 327               shell=shell,
 328               preexec_fn=preexec_fn,
 329               close_fds=close_fds)
 330     out, err = p.communicate(input)
 331     if check and p.returncode != 0:
 332         raise Exception('subprocess %r failed with status %d%s'
 333                         % (b' '.join(map(quote, cmd)), p.returncode,
 334                            ', stderr: %r' % err if err else ''))
 335     return out, err, p
 336
 337 def readpipe(argv, preexec_fn=None, shell=False):
 338     """Run a subprocess and return its output."""
 339     return exo(argv, preexec_fn=preexec_fn, shell=shell)[0]
 340
 341
 342 def _argmax_base(command):
 343     base_size = 2048
 344     for c in command:
 345         base_size += len(command) + 1
 346     for k, v in compat.items(environ):
 347         base_size += len(k) + len(v) + 2 + sizeof(c_void_p)
 348     return base_size
 349
 350
 351 def _argmax_args_size(args):
 352     return sum(len(x) + 1 + sizeof(c_void_p) for x in args)
 353
 354
 355 def batchpipe(command, args, preexec_fn=None, arg_max=sc_arg_max):
 356     """If args is not empty, yield the output produced by calling the
 357 command list with args as a sequence of strings (It may be necessary
 358 to return multiple strings in order to respect ARG_MAX)."""
 359     # The optional arg_max arg is a workaround for an issue with the
 360     # current wvtest behavior.
 361     base_size = _argmax_base(command)
 362     while args:
 363         room = arg_max - base_size
 364         i = 0
 365         while i < len(args):
 366             next_size = _argmax_args_size(args[i:i+1])
 367             if room - next_size < 0:
 368                 break
 369             room -= next_size
 370             i += 1
 371         sub_args = args[:i]
 372         args = args[i:]
 373         assert(len(sub_args))
 374         yield readpipe(command + sub_args, preexec_fn=preexec_fn)
 375
 376
 377 def resolve_parent(p):
 378     """Return the absolute path of a file without following any final symlink.
 379
 380     Behaves like os.path.realpath, but doesn't follow a symlink for the last
 381     element. (ie. if 'p' itself is a symlink, this one won't follow it, but it
 382     will follow symlinks in p's directory)
 383     """
 384     try:
 385         st = os.lstat(p)
 386     except OSError:
 387         st = None
 388     if st and stat.S_ISLNK(st.st_mode):
 389         (dir, name) = os.path.split(p)
 390         dir = os.path.realpath(dir)
 391         out = os.path.join(dir, name)
 392     else:
 393         out = os.path.realpath(p)
 394     #log('realpathing:%r,%r\n' % (p, out))
 395     return out
 396
 397
 398 def detect_fakeroot():
 399     "Return True if we appear to be running under fakeroot."
 400     return os.getenv("FAKEROOTKEY") != None
 401
 402
 403 if sys.platform.startswith('cygwin'):
 404     def is_superuser():
 405         # https://cygwin.com/ml/cygwin/2015-02/msg00057.html
 406         groups = os.getgroups()
 407         return 544 in groups or 0 in groups
 408 else:
 409     def is_superuser():
 410         return os.geteuid() == 0
 411
 412
 413 def cache_key_value(get_value, key, cache):
 414     """Return (value, was_cached).  If there is a value in the cache
 415     for key, use that, otherwise, call get_value(key) which should
 416     throw a KeyError if there is no value -- in which case the cached
 417     and returned value will be None.
 418     """
 419     try: # Do we already have it (or know there wasn't one)?
 420         value = cache[key]
 421         return value, True
 422     except KeyError:
 423         pass
 424     value = None
 425     try:
 426         cache[key] = value = get_value(key)
 427     except KeyError:
 428         cache[key] = None
 429     return value, False
 430
 431
 432 _hostname = None
 433 def hostname():
 434     """Get the FQDN of this machine."""
 435     global _hostname
 436     if not _hostname:
 437         _hostname = _helpers.gethostname()
 438     return _hostname
 439
 440
 441 def format_filesize(size):
 442     unit = 1024.0
 443     size = float(size)
 444     if size < unit:
 445         return "%d" % (size)
 446     exponent = int(math.log(size) // math.log(unit))
 447     size_prefix = "KMGTPE"[exponent - 1]
 448     return "%.1f%s" % (size / math.pow(unit, exponent), size_prefix)
 449
 450
 451 class NotOk(Exception):
 452     pass
 453
 454
 455 class BaseConn:
 456     def __init__(self, outp):
 457         self.outp = outp
 458
 459     def close(self):
 460         while self._read(65536): pass
 461
 462     def _read(self, size):
 463         raise NotImplementedError("Subclasses must implement _read")
 464
 465     def read(self, size):
 466         """Read 'size' bytes from input stream."""
 467         self.outp.flush()
 468         return self._read(size)
 469
 470     def _readline(self, size):
 471         raise NotImplementedError("Subclasses must implement _readline")
 472
 473     def readline(self):
 474         """Read from input stream until a newline is found."""
 475         self.outp.flush()
 476         return self._readline()
 477
 478     def write(self, data):
 479         """Write 'data' to output stream."""
 480         #log('%d writing: %d bytes\n' % (os.getpid(), len(data)))
 481         self.outp.write(data)
 482
 483     def has_input(self):
 484         """Return true if input stream is readable."""
 485         raise NotImplementedError("Subclasses must implement has_input")
 486
 487     def ok(self):
 488         """Indicate end of output from last sent command."""
 489         self.write(b'\nok\n')
 490
 491     def error(self, s):
 492         """Indicate server error to the client."""
 493         s = re.sub(br'\s+', b' ', s)
 494         self.write(b'\nerror %s\n' % s)
 495
 496     def _check_ok(self, onempty):
 497         self.outp.flush()
 498         rl = b''
 499         for rl in linereader(self):
 500             #log('%d got line: %r\n' % (os.getpid(), rl))
 501             if not rl:  # empty line
 502                 continue
 503             elif rl == b'ok':
 504                 return None
 505             elif rl.startswith(b'error '):
 506                 #log('client: error: %s\n' % rl[6:])
 507                 return NotOk(rl[6:])
 508             else:
 509                 onempty(rl)
 510         raise Exception('server exited unexpectedly; see errors above')
 511
 512     def drain_and_check_ok(self):
 513         """Remove all data for the current command from input stream."""
 514         def onempty(rl):
 515             pass
 516         return self._check_ok(onempty)
 517
 518     def check_ok(self):
 519         """Verify that server action completed successfully."""
 520         def onempty(rl):
 521             raise Exception('expected "ok", got %r' % rl)
 522         return self._check_ok(onempty)
 523
 524
 525 class Conn(BaseConn):
 526     def __init__(self, inp, outp):
 527         BaseConn.__init__(self, outp)
 528         self.inp = inp
 529
 530     def _read(self, size):
 531         return self.inp.read(size)
 532
 533     def _readline(self):
 534         return self.inp.readline()
 535
 536     def has_input(self):
 537         [rl, wl, xl] = select.select([self.inp.fileno()], [], [], 0)
 538         if rl:
 539             assert(rl[0] == self.inp.fileno())
 540             return True
 541         else:
 542             return None
 543
 544
 545 def checked_reader(fd, n):
 546     while n > 0:
 547         rl, _, _ = select.select([fd], [], [])
 548         assert(rl[0] == fd)
 549         buf = os.read(fd, n)
 550         if not buf: raise Exception("Unexpected EOF reading %d more bytes" % n)
 551         yield buf
 552         n -= len(buf)
 553
 554
 555 MAX_PACKET = 128 * 1024
 556 def mux(p, outfd, outr, errr):
 557     try:
 558         fds = [outr, errr]
 559         while p.poll() is None:
 560             rl, _, _ = select.select(fds, [], [])
 561             for fd in rl:
 562                 if fd == outr:
 563                     buf = os.read(outr, MAX_PACKET)
 564                     if not buf: break
 565                     os.write(outfd, struct.pack('!IB', len(buf), 1) + buf)
 566                 elif fd == errr:
 567                     buf = os.read(errr, 1024)
 568                     if not buf: break
 569                     os.write(outfd, struct.pack('!IB', len(buf), 2) + buf)
 570     finally:
 571         os.write(outfd, struct.pack('!IB', 0, 3))
 572
 573
 574 class DemuxConn(BaseConn):
 575     """A helper class for bup's client-server protocol."""
 576     def __init__(self, infd, outp):
 577         BaseConn.__init__(self, outp)
 578         # Anything that comes through before the sync string was not
 579         # multiplexed and can be assumed to be debug/log before mux init.
 580         tail = b''
 581         stderr = byte_stream(sys.stderr)
 582         while tail != b'BUPMUX':
 583             # Make sure to write all pre-BUPMUX output to stderr
 584             b = os.read(infd, (len(tail) < 6) and (6-len(tail)) or 1)
 585             if not b:
 586                 ex = IOError('demux: unexpected EOF during initialization')
 587                 with pending_raise(ex):
 588                     stderr.write(tail)
 589                     stderr.flush()
 590             tail += b
 591             stderr.write(tail[:-6])
 592             tail = tail[-6:]
 593         stderr.flush()
 594         self.infd = infd
 595         self.reader = None
 596         self.buf = None
 597         self.closed = False
 598
 599     def write(self, data):
 600         self._load_buf(0)
 601         BaseConn.write(self, data)
 602
 603     def _next_packet(self, timeout):
 604         if self.closed: return False
 605         rl, wl, xl = select.select([self.infd], [], [], timeout)
 606         if not rl: return False
 607         assert(rl[0] == self.infd)
 608         ns = b''.join(checked_reader(self.infd, 5))
 609         n, fdw = struct.unpack('!IB', ns)
 610         if n > MAX_PACKET:
 611             # assume that something went wrong and print stuff
 612             ns += os.read(self.infd, 1024)
 613             stderr = byte_stream(sys.stderr)
 614             stderr.write(ns)
 615             stderr.flush()
 616             raise Exception("Connection broken")
 617         if fdw == 1:
 618             self.reader = checked_reader(self.infd, n)
 619         elif fdw == 2:
 620             for buf in checked_reader(self.infd, n):
 621                 byte_stream(sys.stderr).write(buf)
 622         elif fdw == 3:
 623             self.closed = True
 624             debug2("DemuxConn: marked closed\n")
 625         return True
 626
 627     def _load_buf(self, timeout):
 628         if self.buf is not None:
 629             return True
 630         while not self.closed:
 631             while not self.reader:
 632                 if not self._next_packet(timeout):
 633                     return False
 634             try:
 635                 self.buf = next(self.reader)
 636                 return True
 637             except StopIteration:
 638                 self.reader = None
 639         return False
 640
 641     def _read_parts(self, ix_fn):
 642         while self._load_buf(None):
 643             assert(self.buf is not None)
 644             i = ix_fn(self.buf)
 645             if i is None or i == len(self.buf):
 646                 yv = self.buf
 647                 self.buf = None
 648             else:
 649                 yv = self.buf[:i]
 650                 self.buf = self.buf[i:]
 651             yield yv
 652             if i is not None:
 653                 break
 654
 655     def _readline(self):
 656         def find_eol(buf):
 657             try:
 658                 return buf.index(b'\n')+1
 659             except ValueError:
 660                 return None
 661         return b''.join(self._read_parts(find_eol))
 662
 663     def _read(self, size):
 664         csize = [size]
 665         def until_size(buf): # Closes on csize
 666             if len(buf) < csize[0]:
 667                 csize[0] -= len(buf)
 668                 return None
 669             else:
 670                 return csize[0]
 671         return b''.join(self._read_parts(until_size))
 672
 673     def has_input(self):
 674         return self._load_buf(0)
 675
 676
 677 def linereader(f):
 678     """Generate a list of input lines from 'f' without terminating newlines."""
 679     while 1:
 680         line = f.readline()
 681         if not line:
 682             break
 683         yield line[:-1]
 684
 685
 686 def chunkyreader(f, count = None):
 687     """Generate a list of chunks of data read from 'f'.
 688
 689     If count is None, read until EOF is reached.
 690
 691     If count is a positive integer, read 'count' bytes from 'f'. If EOF is
 692     reached while reading, raise IOError.
 693     """
 694     if count != None:
 695         while count > 0:
 696             b = f.read(min(count, 65536))
 697             if not b:
 698                 raise IOError('EOF with %d bytes remaining' % count)
 699             yield b
 700             count -= len(b)
 701     else:
 702         while 1:
 703             b = f.read(65536)
 704             if not b: break
 705             yield b
 706
 707
 708 @contextmanager
 709 def atomically_replaced_file(name, mode='w', buffering=-1):
 710     """Yield a file that will be atomically renamed name when leaving the block.
 711
 712     This contextmanager yields an open file object that is backed by a
 713     temporary file which will be renamed (atomically) to the target
 714     name if everything succeeds.
 715
 716     The mode and buffering arguments are handled exactly as with open,
 717     and the yielded file will have very restrictive permissions, as
 718     per mkstemp.
 719
 720     E.g.::
 721
 722         with atomically_replaced_file('foo.txt', 'w') as f:
 723             f.write('hello jack.')
 724
 725     """
 726
 727     (ffd, tempname) = tempfile.mkstemp(dir=os.path.dirname(name),
 728                                        text=('b' not in mode))
 729     try:
 730         try:
 731             f = os.fdopen(ffd, mode, buffering)
 732         except:
 733             os.close(ffd)
 734             raise
 735         try:
 736             yield f
 737         finally:
 738             f.close()
 739         os.rename(tempname, name)
 740     finally:
 741         unlink(tempname)  # nonexistant file is ignored
 742
 743
 744 def slashappend(s):
 745     """Append "/" to 's' if it doesn't aleady end in "/"."""
 746     assert isinstance(s, bytes)
 747     if s and not s.endswith(b'/'):
 748         return s + b'/'
 749     else:
 750         return s
 751
 752
 753 def _mmap_do(f, sz, flags, prot, close):
 754     if not sz:
 755         st = os.fstat(f.fileno())
 756         sz = st.st_size
 757     if not sz:
 758         # trying to open a zero-length map gives an error, but an empty
 759         # string has all the same behaviour of a zero-length map, ie. it has
 760         # no elements :)
 761         return ''
 762     map = compat.mmap(f.fileno(), sz, flags, prot)
 763     if close:
 764         f.close()  # map will persist beyond file close
 765     return map
 766
 767
 768 def mmap_read(f, sz = 0, close=True):
 769     """Create a read-only memory mapped region on file 'f'.
 770     If sz is 0, the region will cover the entire file.
 771     """
 772     return _mmap_do(f, sz, mmap.MAP_PRIVATE, mmap.PROT_READ, close)
 773
 774
 775 def mmap_readwrite(f, sz = 0, close=True):
 776     """Create a read-write memory mapped region on file 'f'.
 777     If sz is 0, the region will cover the entire file.
 778     """
 779     return _mmap_do(f, sz, mmap.MAP_SHARED, mmap.PROT_READ|mmap.PROT_WRITE,
 780                     close)
 781
 782
 783 def mmap_readwrite_private(f, sz = 0, close=True):
 784     """Create a read-write memory mapped region on file 'f'.
 785     If sz is 0, the region will cover the entire file.
 786     The map is private, which means the changes are never flushed back to the
 787     file.
 788     """
 789     return _mmap_do(f, sz, mmap.MAP_PRIVATE, mmap.PROT_READ|mmap.PROT_WRITE,
 790                     close)
 791
 792
 793 _mincore = getattr(_helpers, 'mincore', None)
 794 if _mincore:
 795     # ./configure ensures that we're on Linux if MINCORE_INCORE isn't defined.
 796     MINCORE_INCORE = getattr(_helpers, 'MINCORE_INCORE', 1)
 797
 798     _fmincore_chunk_size = None
 799     def _set_fmincore_chunk_size():
 800         global _fmincore_chunk_size
 801         pref_chunk_size = 64 * 1024 * 1024
 802         chunk_size = sc_page_size
 803         if (sc_page_size < pref_chunk_size):
 804             chunk_size = sc_page_size * (pref_chunk_size // sc_page_size)
 805         _fmincore_chunk_size = chunk_size
 806
 807     def fmincore(fd):
 808         """Return the mincore() data for fd as a bytearray whose values can be
 809         tested via MINCORE_INCORE, or None if fd does not fully
 810         support the operation."""
 811         st = os.fstat(fd)
 812         if (st.st_size == 0):
 813             return bytearray(0)
 814         if not _fmincore_chunk_size:
 815             _set_fmincore_chunk_size()
 816         pages_per_chunk = _fmincore_chunk_size // sc_page_size;
 817         page_count = (st.st_size + sc_page_size - 1) // sc_page_size;
 818         chunk_count = (st.st_size + _fmincore_chunk_size - 1) // _fmincore_chunk_size
 819         result = bytearray(page_count)
 820         for ci in compat.range(chunk_count):
 821             pos = _fmincore_chunk_size * ci;
 822             msize = min(_fmincore_chunk_size, st.st_size - pos)
 823             try:
 824                 m = compat.mmap(fd, msize, mmap.MAP_PRIVATE, 0, 0, pos)
 825             except mmap.error as ex:
 826                 if ex.errno == errno.EINVAL or ex.errno == errno.ENODEV:
 827                     # Perhaps the file was a pipe, i.e. "... | bup split ..."
 828                     return None
 829                 raise ex
 830             try:
 831                 _mincore(m, msize, 0, result, ci * pages_per_chunk)
 832             except OSError as ex:
 833                 if ex.errno == errno.ENOSYS:
 834                     return None
 835                 raise
 836         return result
 837
 838
 839 def parse_timestamp(epoch_str):
 840     """Return the number of nanoseconds since the epoch that are described
 841 by epoch_str (100ms, 100ns, ...); when epoch_str cannot be parsed,
 842 throw a ValueError that may contain additional information."""
 843     ns_per = {'s' :  1000000000,
 844               'ms' : 1000000,
 845               'us' : 1000,
 846               'ns' : 1}
 847     match = re.match(r'^((?:[-+]?[0-9]+)?)(s|ms|us|ns)$', epoch_str)
 848     if not match:
 849         if re.match(r'^([-+]?[0-9]+)$', epoch_str):
 850             raise ValueError('must include units, i.e. 100ns, 100ms, ...')
 851         raise ValueError()
 852     (n, units) = match.group(1, 2)
 853     if not n:
 854         n = 1
 855     n = int(n)
 856     return n * ns_per[units]
 857
 858
 859 def parse_num(s):
 860     """Parse string or bytes as a possibly unit suffixed number.
 861
 862     For example:
 863         199.2k means 203981 bytes
 864         1GB means 1073741824 bytes
 865         2.1 tb means 2199023255552 bytes
 866     """
 867     if isinstance(s, bytes):
 868         # FIXME: should this raise a ValueError for UnicodeDecodeError
 869         # (perhaps with the latter as the context).
 870         s = s.decode('ascii')
 871     g = re.match(r'([-+\d.e]+)\s*(\w*)', str(s))
 872     if not g:
 873         raise ValueError("can't parse %r as a number" % s)
 874     (val, unit) = g.groups()
 875     num = float(val)
 876     unit = unit.lower()
 877     if unit in ['t', 'tb']:
 878         mult = 1024*1024*1024*1024
 879     elif unit in ['g', 'gb']:
 880         mult = 1024*1024*1024
 881     elif unit in ['m', 'mb']:
 882         mult = 1024*1024
 883     elif unit in ['k', 'kb']:
 884         mult = 1024
 885     elif unit in ['', 'b']:
 886         mult = 1
 887     else:
 888         raise ValueError("invalid unit %r in number %r" % (unit, s))
 889     return int(num*mult)
 890
 891
 892 saved_errors = []
 893 def add_error(e):
 894     """Append an error message to the list of saved errors.
 895
 896     Once processing is able to stop and output the errors, the saved errors are
 897     accessible in the module variable helpers.saved_errors.
 898     """
 899     saved_errors.append(e)
 900     log('%-70s\n' % e)
 901
 902
 903 def clear_errors():
 904     global saved_errors
 905     saved_errors = []
 906
 907
 908 def die_if_errors(msg=None, status=1):
 909     global saved_errors
 910     if saved_errors:
 911         if not msg:
 912             msg = 'warning: %d errors encountered\n' % len(saved_errors)
 913         log(msg)
 914         sys.exit(status)
 915
 916
 917 def handle_ctrl_c():
 918     """Replace the default exception handler for KeyboardInterrupt (Ctrl-C).
 919
 920     The new exception handler will make sure that bup will exit without an ugly
 921     stacktrace when Ctrl-C is hit.
 922     """
 923     oldhook = sys.excepthook
 924     def newhook(exctype, value, traceback):
 925         if exctype == KeyboardInterrupt:
 926             log('\nInterrupted.\n')
 927         else:
 928             oldhook(exctype, value, traceback)
 929     sys.excepthook = newhook
 930
 931
 932 def columnate(l, prefix):
 933     """Format elements of 'l' in columns with 'prefix' leading each line.
 934
 935     The number of columns is determined automatically based on the string
 936     lengths.
 937     """
 938     binary = isinstance(prefix, bytes)
 939     nothing = b'' if binary else ''
 940     nl = b'\n' if binary else '\n'
 941     if not l:
 942         return nothing
 943     l = l[:]
 944     clen = max(len(s) for s in l)
 945     ncols = (tty_width() - len(prefix)) // (clen + 2)
 946     if ncols <= 1:
 947         ncols = 1
 948         clen = 0
 949     cols = []
 950     while len(l) % ncols:
 951         l.append(nothing)
 952     rows = len(l) // ncols
 953     for s in compat.range(0, len(l), rows):
 954         cols.append(l[s:s+rows])
 955     out = nothing
 956     fmt = b'%-*s' if binary else '%-*s'
 957     for row in zip(*cols):
 958         out += prefix + nothing.join((fmt % (clen+2, s)) for s in row) + nl
 959     return out
 960
 961
 962 def parse_date_or_fatal(str, fatal):
 963     """Parses the given date or calls Option.fatal().
 964     For now we expect a string that contains a float."""
 965     try:
 966         date = float(str)
 967     except ValueError as e:
 968         raise fatal('invalid date format (should be a float): %r' % e)
 969     else:
 970         return date
 971
 972
 973 def parse_excludes(options, fatal):
 974     """Traverse the options and extract all excludes, or call Option.fatal()."""
 975     excluded_paths = []
 976
 977     for flag in options:
 978         (option, parameter) = flag
 979         if option == '--exclude':
 980             excluded_paths.append(resolve_parent(argv_bytes(parameter)))
 981         elif option == '--exclude-from':
 982             try:
 983                 f = open(resolve_parent(argv_bytes(parameter)), 'rb')
 984             except IOError as e:
 985                 raise fatal("couldn't read %r" % parameter)
 986             for exclude_path in f.readlines():
 987                 # FIXME: perhaps this should be rstrip('\n')
 988                 exclude_path = resolve_parent(exclude_path.strip())
 989                 if exclude_path:
 990                     excluded_paths.append(exclude_path)
 991     return sorted(frozenset(excluded_paths))
 992
 993
 994 def parse_rx_excludes(options, fatal):
 995     """Traverse the options and extract all rx excludes, or call
 996     Option.fatal()."""
 997     excluded_patterns = []
 998
 999     for flag in options:
1000         (option, parameter) = flag
1001         if option == '--exclude-rx':
1002             try:
1003                 excluded_patterns.append(re.compile(argv_bytes(parameter)))
1004             except re.error as ex:
1005                 fatal('invalid --exclude-rx pattern (%r): %s' % (parameter, ex))
1006         elif option == '--exclude-rx-from':
1007             try:
1008                 f = open(resolve_parent(parameter), 'rb')
1009             except IOError as e:
1010                 raise fatal("couldn't read %r" % parameter)
1011             for pattern in f.readlines():
1012                 spattern = pattern.rstrip(b'\n')
1013                 if not spattern:
1014                     continue
1015                 try:
1016                     excluded_patterns.append(re.compile(spattern))
1017                 except re.error as ex:
1018                     fatal('invalid --exclude-rx pattern (%r): %s' % (spattern, ex))
1019     return excluded_patterns
1020
1021
def should_rx_exclude_path(path, exclude_rxs):
    """Say whether any regular expression in exclude_rxs matches path.

    The patterns are tried in order with search(); the first one that
    matches is reported via debug1() and True is returned.  False is
    returned when none of them match.
    """
    matching = next((rx for rx in exclude_rxs if rx.search(path)), None)
    if matching is None:
        return False
    debug1('Skipping %r: excluded by rx pattern %r.\n'
           % (path, matching.pattern))
    return True
1030
1031
1032 # FIXME: Carefully consider the use of functions (os.path.*, etc.)
1033 # that resolve against the current filesystem in the strip/graft
1034 # functions for example, but elsewhere as well.  I suspect bup's not
1035 # always being careful about that.  For some cases, the contents of
1036 # the current filesystem should be irrelevant, and consulting it might
1037 # produce the wrong result, perhaps via unintended symlink resolution,
1038 # for example.
1039
def path_components(path):
    """Split path into a list of (name, full_path_to_name) pairs.

    path is a bytes path that must start with b'/'; otherwise an
    Exception is raised.  It is normalized with os.path.abspath()
    before being split, and the first pair always stands for the root.
    Example:
      b'/home/foo' -> [(b'', b'/'), (b'home', b'/home'),
                       (b'foo', b'/home/foo')]"""
    if not path.startswith(b'/'):
        raise Exception('path must start with "/": %s' % path_msg(path))
    normalized = os.path.abspath(path)
    components = [(b'', b'/')]
    if normalized != b'/':
        so_far = b''
        # The split's first element is the empty string before the
        # leading '/', which the root entry above already covers.
        for name in normalized.split(b'/')[1:]:
            so_far = so_far + b'/' + name
            components.append((name, so_far))
    return components
1057
1058
def stripped_path_components(path, strip_prefixes):
    """Strip any prefix in strip_prefixes from path and return a list
    of path components where each component is (name,
    none_or_full_fs_path_to_name).  Assume path startswith('/').

    Both path and the prefixes are normalized with os.path.abspath().
    Prefixes are tried longest first (by their given length), a prefix
    that normalizes to b'/' is ignored, and a prefix only applies if
    path is that directory or lies beneath it -- b'/foo' does not
    match b'/foobar'.  The first component returned for a stripped
    path is (b'', prefix).  If no prefix applies, the result is
    path_components(path).
    See thelpers.py for examples."""
    normalized_path = os.path.abspath(path)
    for bp in sorted(strip_prefixes, key=len, reverse=True):
        normalized_bp = os.path.abspath(bp)
        if normalized_bp == b'/':
            continue
        # Only strip at a component boundary.  A bare startswith()
        # would treat /foo as a prefix of /foobar/x, and then report
        # the nonexistent filesystem path /foo/bar for it.
        if normalized_path != normalized_bp \
           and not normalized_path.startswith(normalized_bp + b'/'):
            continue
        prefix = normalized_bp
        result = []
        # The remainder is either empty (path is the prefix itself) or
        # starts with '/', so the split's first element is always b''
        # and produces the (b'', prefix) entry.
        for p in normalized_path[len(normalized_bp):].split(b'/'):
            if p: # not root
                prefix += b'/'
            prefix += p
            result.append((p, prefix))
        return result
    # Nothing to strip.
    return path_components(path)
1079     # Nothing to strip.
1080     return path_components(path)
1081
1082
def grafted_path_components(graft_points, path):
    """Return the (name, fs_path_or_None) components of path after
    applying the first applicable graft point.

    graft_points is an iterable of (old_prefix, new_prefix) pairs of
    bytes paths, as in --graft old=new.  path is normalized with
    os.path.abspath() and the prefixes with os.path.normpath().  A
    graft point applies if path is old_prefix or lies beneath it.  If
    none applies, the result is path_components() of the normalized
    path.
    """
    # Create a result that consists of some number of faked graft
    # directories before the graft point, followed by all of the real
    # directories from path that are after the graft point.  Arrange
    # for the directory at the graft point in the result to correspond
    # to the "orig" directory in --graft orig=new.  See t/thelpers.py
    # for some examples.

    # Note that given --graft orig=new, orig and new have *nothing* to
    # do with each other, even if some of their component names
    # match. i.e. --graft /foo/bar/baz=/foo/bar/bax is semantically
    # equivalent to --graft /foo/bar/baz=/x/y/z, or even
    # /foo/bar/baz=/x.

    # FIXME: This can't be the best solution...
    clean_path = os.path.abspath(path)
    for old_prefix, new_prefix in graft_points:
        # Normalize the prefixes (this does not make them absolute).
        old_prefix = os.path.normpath(old_prefix)
        new_prefix = os.path.normpath(new_prefix)
        # Only graft at a component boundary, so that /foo doesn't
        # capture /foobar.
        if not (old_prefix == b'/'
                or clean_path == old_prefix
                or clean_path.startswith(old_prefix + b'/')):
            continue
        # Swap the prefix by slicing.  re.sub() would interpret any
        # backslashes in new_prefix as escapes in a replacement
        # template.
        grafted_path = new_prefix + clean_path[len(old_prefix):]
        # Handle /foo=/ (at least) -- which produces //whatever.
        grafted_path = b'/' + grafted_path.lstrip(b'/')
        clean_path_components = path_components(clean_path)
        # Count the components that were stripped.
        strip_count = 0 if old_prefix == b'/' else old_prefix.count(b'/')
        new_prefix_parts = new_prefix.split(b'/')
        result_prefix = grafted_path.split(b'/')[:new_prefix.count(b'/')]
        # The faked directories have no corresponding filesystem path.
        result = [(p, None) for p in result_prefix] \
            + clean_path_components[strip_count:]
        # Now set the graft point name to match the end of new_prefix.
        graft_index = len(result_prefix)
        result[graft_index] = \
            (new_prefix_parts[-1], clean_path_components[strip_count][1])
        if new_prefix == b'/': # --graft ...=/ is a special case.
            return result[1:]
        return result
    return path_components(clean_path)
1124
1125
# Alias for hashlib.sha1.
Sha1 = hashlib.sha1


# _helpers.localtime if the _helpers module has one, otherwise None.
_localtime = getattr(_helpers, 'localtime', None)

if _localtime:
    # Tuple type for _helpers.localtime() results; besides the fields
    # also found in time.struct_time it names tm_gmtoff and tm_zone.
    bup_time = namedtuple('bup_time', ['tm_year', 'tm_mon', 'tm_mday',
                                       'tm_hour', 'tm_min', 'tm_sec',
                                       'tm_wday', 'tm_yday',
                                       'tm_isdst', 'tm_gmtoff', 'tm_zone'])
1136
# Define a localtime() that returns bup_time when possible.  Note:
# this means that any helpers.localtime() results may need to be
# passed through to_py_time() before being passed to python's time
# module, which doesn't appear willing to ignore the extra items.
if _localtime:
    def localtime(time):
        """Return the bup_time built from _helpers.localtime() for time,
        which is rounded down to a whole number first."""
        return bup_time(*_helpers.localtime(int(floor(time))))
    def utc_offset_str(t):
        """Return the local offset from UTC as "+hhmm" or "-hhmm" for time t.
        If the current UTC offset does not represent an integer number
        of minutes, the fractional component will be truncated."""
        off = localtime(t).tm_gmtoff
        # Note: // doesn't truncate like C for negative values, it rounds down.
        offmin = abs(off) // 60
        h, m = divmod(offmin, 60)
        # Format the sign on its own: negating the hours loses it when
        # they're zero (e.g. -1800 seconds must give -0030, not +0030).
        return b'%s%02d%02d' % (b'-' if off < 0 else b'+', h, m)
    def to_py_time(x):
        """Return x as a time.struct_time, keeping only the first nine
        items when x isn't one already."""
        if isinstance(x, time.struct_time):
            return x
        return time.struct_time(x[:9])
else:
    localtime = time.localtime
    def utc_offset_str(t):
        """Return the local offset from UTC for time t, as formatted
        by strftime's %z."""
        # Pass a native str format, since time.strftime() rejects bytes
        # on Python 3, and encode the result so that this returns
        # bytes like the implementation above.
        return time.strftime('%z', localtime(t)).encode('ascii')
    def to_py_time(x):
        """Return x unchanged; localtime() is time.localtime here."""
        return x
1164
1165
_some_invalid_save_parts_rx = re.compile(br'[\[ ~^:?*\\]|\.\.|//|@{')
_save_name_control_bytes_rx = re.compile(br'[\x00-\x1f\x7f]')

def valid_save_name(name):
    """Return True if the bytes name is acceptable as a save name.

    Enforces a superset of the restrictions in git-check-ref-format(1).
    A name is rejected if it is empty or b'@'; starts or ends with
    b'/'; ends with b'.'; contains a space, any of [ ~ ^ : ? * or
    backslash, b'..', b'//', b'@{', or a byte below 0x20 or equal to
    0x7f; or has a '/'-separated part that starts with b'.' or ends
    with b'.lock'.
    """
    # An empty name is never a valid ref, and would otherwise slip
    # through every check below.
    if not name or name == b'@':
        return False
    if name.startswith(b'/') or name.endswith(b'/') or name.endswith(b'.'):
        return False
    if _some_invalid_save_parts_rx.search(name):
        return False
    if _save_name_control_bytes_rx.search(name):
        return False
    for part in name.split(b'/'):
        if part.startswith(b'.') or part.endswith(b'.lock'):
            return False
    return True
1183
1184
_period_rx = re.compile(br'^([0-9]+)(s|min|h|d|w|m|y)$')

def period_as_secs(s):
    """Convert the bytes period s to a number of seconds.

    s is either b'forever', which gives float('inf'), or a decimal
    count followed by one of the units s, min, h, d, w, m, or y.  A
    month counts as 31 days and a year as 366 days.  Returns None if s
    has any other form.
    """
    if s == b'forever':
        return float('inf')
    parsed = _period_rx.match(s)
    if parsed is None:
        return None
    count, unit = parsed.groups()
    minute = 60
    hour = 60 * minute
    day = 24 * hour
    secs_per_unit = {
        b's': 1,
        b'min': minute,
        b'h': hour,
        b'd': day,
        b'w': 7 * day,
        b'm': 31 * day,
        b'y': 366 * day,
    }
    return int(count) * secs_per_unit[unit]