From fc92822b6c5771ae56afd9472017d5aa7efeb95d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 17:35:27 +0200 Subject: [PATCH 01/22] explain the confusing TypeError, fixes #1456 --- borg/remote.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/borg/remote.py b/borg/remote.py index 47a20412b..3dda24133 100644 --- a/borg/remote.py +++ b/borg/remote.py @@ -185,6 +185,16 @@ class RemoteRepository: except self.RPCError as err: if err.remote_type != 'TypeError': raise + msg = """\
+Please note: +If you see a TypeError complaining about the number of positional arguments +given to open(), you can ignore it if it comes from a borg version < 1.0.7. +This TypeError is a cosmetic side effect of the compatibility code borg +clients >= 1.0.7 have to support older borg servers. +This problem will go away as soon as the server has been upgraded to 1.0.7+. +""" + # emit this msg in the same way as the "Remote: ..." lines that show the remote TypeError + sys.stderr.write(msg) if append_only: raise self.NoAppendOnlyOnServer() self.id = self.call('open', self.location.path, create, lock_wait, lock) From 2624d6f81858edef821363526b15614960cba736 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 20:30:50 +0200 Subject: [PATCH 02/22] larger item metadata stream chunks, fixes #1452 increasing the mask (target chunk size) from 14 (16kiB) to 17 (128kiB). this should reduce the amount of item metadata chunks an archive has to reference to 1/8. this does not completely fix #1452, but at least enables an 8x larger item metadata stream. 
--- borg/archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/borg/archive.py b/borg/archive.py index 653e2dd1c..4498c72a3 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -39,7 +39,7 @@ HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) # chunker params for the items metadata stream, finer granularity -ITEMS_CHUNKER_PARAMS = (12, 16, 14, HASH_WINDOW_SIZE) +ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE) has_lchmod = hasattr(os, 'lchmod') has_lchflags = hasattr(os, 'lchflags') From 7b5772df2d68b544ac90bf670efcf24ebdf7a6c9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 10 Aug 2016 13:56:06 +0200 Subject: [PATCH 03/22] add transaction_id assertion an acd_cli (amazon cloud drive fuse filesystem) user had "borg init" crash in the line below. by adding the assertion we tell that we do not expect the transaction_id to be None there, so it is easier to differentiate from a random coding error. 
--- borg/repository.py | 1 + 1 file changed, 1 insertion(+) diff --git a/borg/repository.py b/borg/repository.py index 686f30a7d..c33f49da9 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -248,6 +248,7 @@ class Repository: b'segments': self.segments, b'compact': list(self.compact)} transaction_id = self.io.get_segments_transaction_id() + assert transaction_id is not None hints_file = os.path.join(self.path, 'hints.%d' % transaction_id) with open(hints_file + '.tmp', 'wb') as fd: msgpack.pack(hints, fd) From 6e658a5a6cc41f0eac8c71e1ba28e9429cf8508e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 10 Aug 2016 15:45:57 +0200 Subject: [PATCH 04/22] docs: improve prune examples --- docs/usage.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 82da1f978..ab92c1cb1 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -423,8 +423,8 @@ you restrict its operation to a subset of the archives using ``--prefix``. When using ``--prefix``, be careful to choose a good prefix - e.g. do not use a prefix "foo" if you do not also want to match "foobar". -It is strongly recommended to always run ``prune --dry-run ...`` first so you -will see what it would do without it actually doing anything. +It is strongly recommended to always run ``prune -v --list --dry-run ...`` +first so you will see what it would do without it actually doing anything. There is also a visualized prune example in ``docs/misc/prune-example.txt``. @@ -432,19 +432,19 @@ There is also a visualized prune example in ``docs/misc/prune-example.txt``. # Keep 7 end of day and 4 additional end of week archives. # Do a dry-run without actually deleting anything. 
- $ borg prune --dry-run --keep-daily=7 --keep-weekly=4 /path/to/repo + $ borg prune -v --list --dry-run --keep-daily=7 --keep-weekly=4 /path/to/repo # Same as above but only apply to archive names starting with the hostname # of the machine followed by a "-" character: - $ borg prune --keep-daily=7 --keep-weekly=4 --prefix='{hostname}-' /path/to/repo + $ borg prune -v --list --keep-daily=7 --keep-weekly=4 --prefix='{hostname}-' /path/to/repo # Keep 7 end of day, 4 additional end of week archives, # and an end of month archive for every month: - $ borg prune --keep-daily=7 --keep-weekly=4 --keep-monthly=-1 /path/to/repo + $ borg prune -v --list --keep-daily=7 --keep-weekly=4 --keep-monthly=-1 /path/to/repo # Keep all backups in the last 10 days, 4 additional end of week archives, # and an end of month archive for every month: - $ borg prune --keep-within=10d --keep-weekly=4 --keep-monthly=-1 /path/to/repo + $ borg prune -v --list --keep-within=10d --keep-weekly=4 --keep-monthly=-1 /path/to/repo .. include:: usage/info.rst.inc From c61a9e8aa0ce34e07ea738f1ef42c4c4e2812cf7 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Fri, 12 Aug 2016 13:00:53 +0200 Subject: [PATCH 05/22] Print active env var override by default Fixes #1467 --- borg/helpers.py | 6 +++--- borg/testsuite/helpers.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/borg/helpers.py b/borg/helpers.py index 4275d783e..975ddca81 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -912,7 +912,7 @@ DEFAULTISH = ('Default', 'DEFAULT', 'default', 'D', 'd', '', ) def yes(msg=None, false_msg=None, true_msg=None, default_msg=None, - retry_msg=None, invalid_msg=None, env_msg=None, + retry_msg=None, invalid_msg=None, env_msg='{} (from {})', falsish=FALSISH, truish=TRUISH, defaultish=DEFAULTISH, default=False, retry=True, env_var_override=None, ofile=None, input=input): """Output (usually a question) and let user input an answer. 
@@ -933,8 +933,8 @@ def yes(msg=None, false_msg=None, true_msg=None, default_msg=None, :param true_msg: message to output before returning True [None] :param default_msg: message to output before returning a [None] :param invalid_msg: message to output after a invalid answer was given [None] - :param env_msg: message to output when using input from env_var_override [None], - needs to have 2 placeholders for answer and env var name, e.g.: "{} (from {})" + :param env_msg: message to output when using input from env_var_override ['{} (from {})'], + needs to have 2 placeholders for answer and env var name :param falsish: sequence of answers qualifying as False :param truish: sequence of answers qualifying as True :param defaultish: sequence of answers qualifying as diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index 35993ef12..9e03019df 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ -805,6 +805,16 @@ def test_yes_output(capfd): assert 'false-msg' in err +def test_yes_env_output(capfd, monkeypatch): + env_var = 'OVERRIDE_SOMETHING' + monkeypatch.setenv(env_var, 'yes') + assert yes(env_var_override=env_var) + out, err = capfd.readouterr() + assert out == '' + assert env_var in err + assert 'yes' in err + + def test_progress_percentage_multiline(capfd): pi = ProgressIndicatorPercent(1000, step=5, start=0, same_line=False, msg="%3.0f%%", file=sys.stderr) pi.show(0) From 17c77a5dc51064dfbd4a0ec6129a911ab59279a3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 11 Aug 2016 15:30:50 +0200 Subject: [PATCH 06/22] xattr: dynamically grow result buffer until it fits, fixes #1462 this also fixes the race condition seen in #1462 because there is only 1 call now. either it succeeds, then we get the correct length as result and truncate the result value to that length. or it fails with ERANGE, then we grow the buffer to double size and repeat. or it fails with some other error, then we throw OSError. 
--- borg/testsuite/xattr.py | 22 ++++++++++++- borg/xattr.py | 68 ++++++++++++++++++++++++++++++----------- 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/borg/testsuite/xattr.py b/borg/testsuite/xattr.py index df0130c90..ecc92fa18 100644 --- a/borg/testsuite/xattr.py +++ b/borg/testsuite/xattr.py @@ -2,7 +2,7 @@ import os import tempfile import unittest -from ..xattr import is_enabled, getxattr, setxattr, listxattr +from ..xattr import is_enabled, getxattr, setxattr, listxattr, get_buffer from . import BaseTestCase @@ -38,3 +38,23 @@ class XattrTestCase(BaseTestCase): self.assert_equal(getxattr(self.tmpfile.fileno(), 'user.foo'), b'bar') self.assert_equal(getxattr(self.symlink, 'user.foo'), b'bar') self.assert_equal(getxattr(self.tmpfile.name, 'user.empty'), None) + + def test_listxattr_buffer_growth(self): + # make it work even with ext4, which imposes rather low limits + get_buffer(size=64, init=True) + # xattr raw key list will be size 9 * (10 + 1), which is > 64 + keys = ['user.attr%d' % i for i in range(9)] + for key in keys: + setxattr(self.tmpfile.name, key, b'x') + got_keys = listxattr(self.tmpfile.name) + self.assert_equal_se(got_keys, keys) + self.assert_equal(len(get_buffer()), 128) + + def test_getxattr_buffer_growth(self): + # make it work even with ext4, which imposes rather low limits + get_buffer(size=64, init=True) + value = b'x' * 126 + setxattr(self.tmpfile.name, 'user.big', value) + got_value = getxattr(self.tmpfile.name, 'user.big') + self.assert_equal(value, got_value) + self.assert_equal(len(get_buffer()), 128) diff --git a/borg/xattr.py b/borg/xattr.py index e88d7ce8c..146f3ae76 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -6,6 +6,7 @@ import re import subprocess import sys import tempfile +import threading from ctypes import CDLL, create_string_buffer, c_ssize_t, c_size_t, c_char_p, c_int, c_uint32, get_errno from ctypes.util import find_library from distutils.version import LooseVersion @@ -14,6 +15,18 @@ from 
.logger import create_logger logger = create_logger() +def get_buffer(size=None, init=False): + if size is not None: + size = int(size) + if init or len(thread_local.buffer) < size: + thread_local.buffer = create_string_buffer(size) + return thread_local.buffer + + +thread_local = threading.local() +get_buffer(size=4096, init=True) + + def is_enabled(path=None): """Determine if xattr is enabled on the filesystem """ @@ -78,9 +91,17 @@ except OSError as e: raise Exception(msg) +class BufferTooSmallError(Exception): + """the buffer given to an xattr function was too small for the result""" + + def _check(rv, path=None): if rv < 0: - raise OSError(get_errno(), path) + e = get_errno() + if e == errno.ERANGE: + raise BufferTooSmallError + else: + raise OSError(e, path) return rv if sys.platform.startswith('linux'): # pragma: linux only @@ -106,14 +127,20 @@ if sys.platform.startswith('linux'): # pragma: linux only func = libc.listxattr else: func = libc.llistxattr - n = _check(func(path, None, 0), path) - if n == 0: - return [] - namebuf = create_string_buffer(n) - n2 = _check(func(path, namebuf, n), path) - if n2 != n: - raise Exception('listxattr failed') - return [os.fsdecode(name) for name in namebuf.raw.split(b'\0')[:-1] if not name.startswith(b'system.posix_acl_')] + size = len(get_buffer()) + while True: + buf = get_buffer(size) + try: + n = _check(func(path, buf, size), path) + except BufferTooSmallError: + size *= 2 + else: + if n == 0: + return [] + names = buf.raw[:n] + return [os.fsdecode(name) + for name in names.split(b'\0')[:-1] + if not name.startswith(b'system.posix_acl_')] def getxattr(path, name, *, follow_symlinks=True): name = os.fsencode(name) @@ -125,14 +152,17 @@ if sys.platform.startswith('linux'): # pragma: linux only func = libc.getxattr else: func = libc.lgetxattr - n = _check(func(path, name, None, 0)) - if n == 0: - return - valuebuf = create_string_buffer(n) - n2 = _check(func(path, name, valuebuf, n), path) - if n2 != n: - raise 
Exception('getxattr failed') - return valuebuf.raw + size = len(get_buffer()) + while True: + buf = get_buffer(size) + try: + n = _check(func(path, name, buf, size), path) + except BufferTooSmallError: + size *= 2 + else: + if n == 0: + return + return buf.raw[:n] def setxattr(path, name, value, *, follow_symlinks=True): name = os.fsencode(name) @@ -172,6 +202,7 @@ elif sys.platform == 'darwin': # pragma: darwin only func = libc.flistxattr elif not follow_symlinks: flags = XATTR_NOFOLLOW + # TODO: fix, see linux n = _check(func(path, None, 0, flags), path) if n == 0: return [] @@ -191,6 +222,7 @@ elif sys.platform == 'darwin': # pragma: darwin only func = libc.fgetxattr elif not follow_symlinks: flags = XATTR_NOFOLLOW + # TODO: fix, see linux n = _check(func(path, name, None, 0, 0, flags)) if n == 0: return @@ -244,6 +276,7 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only func = libc.extattr_list_file else: func = libc.extattr_list_link + # TODO: fix, see linux n = _check(func(path, ns, None, 0), path) if n == 0: return [] @@ -269,6 +302,7 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only func = libc.extattr_get_file else: func = libc.extattr_get_link + # TODO: fix, see linux n = _check(func(path, EXTATTR_NAMESPACE_USER, name, None, 0)) if n == 0: return From 67c6c1898cbaf0daf591a8c00749b068b225ac56 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 11 Aug 2016 19:47:04 +0200 Subject: [PATCH 07/22] xattr: refactor code, deduplicate this code would be otherwise duplicated 3 times for linux, freebsd, darwin. 
--- borg/xattr.py | 282 ++++++++++++++++++++++++++------------------------ 1 file changed, 145 insertions(+), 137 deletions(-) diff --git a/borg/xattr.py b/borg/xattr.py index 146f3ae76..94df6b05c 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -46,6 +46,7 @@ def get_all(path, follow_symlinks=True): if e.errno in (errno.ENOTSUP, errno.EPERM): return {} + libc_name = find_library('c') if libc_name is None: # find_library didn't work, maybe we are on some minimal system that misses essential @@ -104,6 +105,46 @@ def _check(rv, path=None): raise OSError(e, path) return rv + +def _listxattr_inner(func, path): + if isinstance(path, str): + path = os.fsencode(path) + size = len(get_buffer()) + while True: + buf = get_buffer(size) + try: + n = _check(func(path, buf, size), path) + except BufferTooSmallError: + size *= 2 + assert size < 2 ** 20 + else: + return n, buf.raw + + +def _getxattr_inner(func, path, name): + if isinstance(path, str): + path = os.fsencode(path) + name = os.fsencode(name) + size = len(get_buffer()) + while True: + buf = get_buffer(size) + try: + n = _check(func(path, name, buf, size), path) + except BufferTooSmallError: + size *= 2 + else: + return n, buf.raw + + +def _setxattr_inner(func, path, name, value): + if isinstance(path, str): + path = os.fsencode(path) + name = os.fsencode(name) + value = value and os.fsencode(value) + size = len(value) if value else 0 + _check(func(path, name, value, size), path) + + if sys.platform.startswith('linux'): # pragma: linux only libc.llistxattr.argtypes = (c_char_p, c_char_p, c_size_t) libc.llistxattr.restype = c_ssize_t @@ -119,63 +160,49 @@ if sys.platform.startswith('linux'): # pragma: linux only libc.fgetxattr.restype = c_ssize_t def listxattr(path, *, follow_symlinks=True): - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.flistxattr - elif follow_symlinks: - func = libc.listxattr - else: - func = libc.llistxattr - size = len(get_buffer()) - while True: - 
buf = get_buffer(size) - try: - n = _check(func(path, buf, size), path) - except BufferTooSmallError: - size *= 2 + def func(path, buf, size): + if isinstance(path, int): + return libc.flistxattr(path, buf, size) else: - if n == 0: - return [] - names = buf.raw[:n] - return [os.fsdecode(name) - for name in names.split(b'\0')[:-1] - if not name.startswith(b'system.posix_acl_')] + if follow_symlinks: + return libc.listxattr(path, buf, size) + else: + return libc.llistxattr(path, buf, size) + + n, buf = _listxattr_inner(func, path) + if n == 0: + return [] + names = buf[:n].split(b'\0')[:-1] + return [os.fsdecode(name) for name in names + if not name.startswith(b'system.posix_acl_')] def getxattr(path, name, *, follow_symlinks=True): - name = os.fsencode(name) - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.fgetxattr - elif follow_symlinks: - func = libc.getxattr - else: - func = libc.lgetxattr - size = len(get_buffer()) - while True: - buf = get_buffer(size) - try: - n = _check(func(path, name, buf, size), path) - except BufferTooSmallError: - size *= 2 + def func(path, name, buf, size): + if isinstance(path, int): + return libc.fgetxattr(path, name, buf, size) else: - if n == 0: - return - return buf.raw[:n] + if follow_symlinks: + return libc.getxattr(path, name, buf, size) + else: + return libc.lgetxattr(path, name, buf, size) + + n, buf = _getxattr_inner(func, path, name) + if n == 0: + return + return buf[:n] def setxattr(path, name, value, *, follow_symlinks=True): - name = os.fsencode(name) - value = value and os.fsencode(value) - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.fsetxattr - elif follow_symlinks: - func = libc.setxattr - else: - func = libc.lsetxattr - _check(func(path, name, value, len(value) if value else 0, 0), path) + def func(path, name, value, size): + flags = 0 + if isinstance(path, int): + return libc.fsetxattr(path, name, value, size, flags) 
+ else: + if follow_symlinks: + return libc.setxattr(path, name, value, size, flags) + else: + return libc.lsetxattr(path, name, value, size, flags) + + _setxattr_inner(func, path, name, value) elif sys.platform == 'darwin': # pragma: darwin only libc.listxattr.argtypes = (c_char_p, c_char_p, c_size_t, c_int) @@ -191,62 +218,53 @@ elif sys.platform == 'darwin': # pragma: darwin only libc.fgetxattr.argtypes = (c_int, c_char_p, c_char_p, c_size_t, c_uint32, c_int) libc.fgetxattr.restype = c_ssize_t + XATTR_NOFLAGS = 0x0000 XATTR_NOFOLLOW = 0x0001 def listxattr(path, *, follow_symlinks=True): - func = libc.listxattr - flags = 0 - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.flistxattr - elif not follow_symlinks: - flags = XATTR_NOFOLLOW - # TODO: fix, see linux - n = _check(func(path, None, 0, flags), path) + def func(path, buf, size): + if isinstance(path, int): + return libc.flistxattr(path, buf, size, XATTR_NOFLAGS) + else: + if follow_symlinks: + return libc.listxattr(path, buf, size, XATTR_NOFLAGS) + else: + return libc.listxattr(path, buf, size, XATTR_NOFOLLOW) + + n, buf = _listxattr_inner(func, path) if n == 0: return [] - namebuf = create_string_buffer(n) - n2 = _check(func(path, namebuf, n, flags), path) - if n2 != n: - raise Exception('listxattr failed') - return [os.fsdecode(name) for name in namebuf.raw.split(b'\0')[:-1]] + names = buf[:n].split(b'\0')[:-1] + return [os.fsdecode(name) for name in names] def getxattr(path, name, *, follow_symlinks=True): - name = os.fsencode(name) - func = libc.getxattr - flags = 0 - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.fgetxattr - elif not follow_symlinks: - flags = XATTR_NOFOLLOW - # TODO: fix, see linux - n = _check(func(path, name, None, 0, 0, flags)) + def func(path, name, buf, size): + if isinstance(path, int): + return libc.fgetxattr(path, name, buf, size, 0, XATTR_NOFLAGS) + else: + if follow_symlinks: + 
return libc.getxattr(path, name, buf, size, 0, XATTR_NOFLAGS) + else: + return libc.getxattr(path, name, buf, size, 0, XATTR_NOFOLLOW) + + n, buf = _getxattr_inner(func, path, name) if n == 0: return - valuebuf = create_string_buffer(n) - n2 = _check(func(path, name, valuebuf, n, 0, flags), path) - if n2 != n: - raise Exception('getxattr failed') - return valuebuf.raw + return buf[:n] def setxattr(path, name, value, *, follow_symlinks=True): - name = os.fsencode(name) - value = value and os.fsencode(value) - func = libc.setxattr - flags = 0 - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.fsetxattr - elif not follow_symlinks: - flags = XATTR_NOFOLLOW - _check(func(path, name, value, len(value) if value else 0, 0, flags), path) + def func(path, name, value, size): + if isinstance(path, int): + return libc.fsetxattr(path, name, value, size, 0, XATTR_NOFLAGS) + else: + if follow_symlinks: + return libc.setxattr(path, name, value, size, 0, XATTR_NOFLAGS) + else: + return libc.setxattr(path, name, value, size, 0, XATTR_NOFOLLOW) + + _setxattr_inner(func, path, name, value) elif sys.platform.startswith('freebsd'): # pragma: freebsd only - EXTATTR_NAMESPACE_USER = 0x0001 libc.extattr_list_fd.argtypes = (c_int, c_int, c_char_p, c_size_t) libc.extattr_list_fd.restype = c_ssize_t libc.extattr_list_link.argtypes = (c_char_p, c_int, c_char_p, c_size_t) @@ -265,27 +283,23 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only libc.extattr_set_link.restype = c_int libc.extattr_set_file.argtypes = (c_char_p, c_int, c_char_p, c_char_p, c_size_t) libc.extattr_set_file.restype = c_int + ns = EXTATTR_NAMESPACE_USER = 0x0001 def listxattr(path, *, follow_symlinks=True): - ns = EXTATTR_NAMESPACE_USER - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.extattr_list_fd - elif follow_symlinks: - func = libc.extattr_list_file - else: - func = libc.extattr_list_link - # TODO: fix, see 
linux - n = _check(func(path, ns, None, 0), path) + def func(path, buf, size): + if isinstance(path, int): + return libc.extattr_list_fd(path, ns, buf, size) + else: + if follow_symlinks: + return libc.extattr_list_file(path, ns, buf, size) + else: + return libc.extattr_list_link(path, ns, buf, size) + + n, buf = _listxattr_inner(func, path) if n == 0: return [] - namebuf = create_string_buffer(n) - n2 = _check(func(path, ns, namebuf, n), path) - if n2 != n: - raise Exception('listxattr failed') names = [] - mv = memoryview(namebuf.raw) + mv = memoryview(buf) while mv: length = mv[0] names.append(os.fsdecode(bytes(mv[1:1 + length]))) @@ -293,37 +307,31 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only return names def getxattr(path, name, *, follow_symlinks=True): - name = os.fsencode(name) - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.extattr_get_fd - elif follow_symlinks: - func = libc.extattr_get_file - else: - func = libc.extattr_get_link - # TODO: fix, see linux - n = _check(func(path, EXTATTR_NAMESPACE_USER, name, None, 0)) + def func(path, name, buf, size): + if isinstance(path, int): + return libc.extattr_get_fd(path, ns, name, buf, size) + else: + if follow_symlinks: + return libc.extattr_get_file(path, ns, name, buf, size) + else: + return libc.extattr_get_link(path, ns, name, buf, size) + + n, buf = _getxattr_inner(func, path, name) if n == 0: return - valuebuf = create_string_buffer(n) - n2 = _check(func(path, EXTATTR_NAMESPACE_USER, name, valuebuf, n), path) - if n2 != n: - raise Exception('getxattr failed') - return valuebuf.raw + return buf[:n] def setxattr(path, name, value, *, follow_symlinks=True): - name = os.fsencode(name) - value = value and os.fsencode(value) - if isinstance(path, str): - path = os.fsencode(path) - if isinstance(path, int): - func = libc.extattr_set_fd - elif follow_symlinks: - func = libc.extattr_set_file - else: - func = libc.extattr_set_link - 
_check(func(path, EXTATTR_NAMESPACE_USER, name, value, len(value) if value else 0), path) + def func(path, name, value, size): + if isinstance(path, int): + return libc.extattr_set_fd(path, ns, name, value, size) + else: + if follow_symlinks: + return libc.extattr_set_file(path, ns, name, value, size) + else: + return libc.extattr_set_link(path, ns, name, value, size) + + _setxattr_inner(func, path, name, value) else: # pragma: unknown platform only def listxattr(path, *, follow_symlinks=True): From 09dbec99a05040244f1e4aca908552ba07a42052 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 11 Aug 2016 20:11:36 +0200 Subject: [PATCH 08/22] raise OSError including the error message derived from errno also: deal with path being a integer FD --- borg/xattr.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/borg/xattr.py b/borg/xattr.py index 94df6b05c..82a3afb4e 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -102,7 +102,13 @@ def _check(rv, path=None): if e == errno.ERANGE: raise BufferTooSmallError else: - raise OSError(e, path) + try: + msg = os.strerror(e) + except ValueError: + msg = '' + if isinstance(path, int): + path = '' % path + raise OSError(e, msg, path) return rv From 418794f66f9b577de67a9ba15618738aa81ba626 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 11 Aug 2016 21:52:48 +0200 Subject: [PATCH 09/22] xattr: errno ERANGE has different meanings --- borg/xattr.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/borg/xattr.py b/borg/xattr.py index 82a3afb4e..2649eadb0 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -18,6 +18,7 @@ logger = create_logger() def get_buffer(size=None, init=False): if size is not None: size = int(size) + assert size < 2 ** 24 if init or len(thread_local.buffer) < size: thread_local.buffer = create_string_buffer(size) return thread_local.buffer @@ -96,10 +97,12 @@ class BufferTooSmallError(Exception): """the buffer given to an xattr function was too 
small for the result""" -def _check(rv, path=None): +def _check(rv, path=None, detect_buffer_too_small=False): if rv < 0: e = get_errno() - if e == errno.ERANGE: + if detect_buffer_too_small and e == errno.ERANGE: + # listxattr and getxattr signal with ERANGE that they need a bigger result buffer. + # setxattr signals this way that e.g. a xattr key name is too long / inacceptable. raise BufferTooSmallError else: try: @@ -119,10 +122,9 @@ def _listxattr_inner(func, path): while True: buf = get_buffer(size) try: - n = _check(func(path, buf, size), path) + n = _check(func(path, buf, size), path, detect_buffer_too_small=True) except BufferTooSmallError: size *= 2 - assert size < 2 ** 20 else: return n, buf.raw @@ -135,7 +137,7 @@ def _getxattr_inner(func, path, name): while True: buf = get_buffer(size) try: - n = _check(func(path, name, buf, size), path) + n = _check(func(path, name, buf, size), path, detect_buffer_too_small=True) except BufferTooSmallError: size *= 2 else: @@ -148,7 +150,7 @@ def _setxattr_inner(func, path, name, value): name = os.fsencode(name) value = value and os.fsencode(value) size = len(value) if value else 0 - _check(func(path, name, value, size), path) + _check(func(path, name, value, size), path, detect_buffer_too_small=False) if sys.platform.startswith('linux'): # pragma: linux only From 4eac66fe2aa7c18569703ef10149c9171dc784a7 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 11 Aug 2016 22:47:12 +0200 Subject: [PATCH 10/22] xattr: fix race condition in get_all(), see issue #906 --- borg/xattr.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/borg/xattr.py b/borg/xattr.py index 2649eadb0..0439b193c 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -15,6 +15,13 @@ from .logger import create_logger logger = create_logger() +try: + ENOATTR = errno.ENOATTR +except AttributeError: + # on some platforms, ENOATTR is missing, use ENODATA there + ENOATTR = errno.ENODATA + + def get_buffer(size=None, 
init=False): if size is not None: size = int(size) @@ -41,8 +48,17 @@ def is_enabled(path=None): def get_all(path, follow_symlinks=True): try: - return dict((name, getxattr(path, name, follow_symlinks=follow_symlinks)) - for name in listxattr(path, follow_symlinks=follow_symlinks)) + result = {} + names = listxattr(path, follow_symlinks=follow_symlinks) + for name in names: + try: + result[name] = getxattr(path, name, follow_symlinks=follow_symlinks) + except OSError as e: + # if we get ENOATTR, a race has happened: xattr names were deleted after list. + # we just ignore the now missing ones. if you want consistency, do snapshots. + if e.errno != ENOATTR: + raise + return result except OSError as e: if e.errno in (errno.ENOTSUP, errno.EPERM): return {} From 7ea052a5e80ece5fd70640c918d6bcb8f50d3a5a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 02:55:49 +0200 Subject: [PATCH 11/22] xattr: buffer full check for freebsd freebsd 10.2: it does not give rc < 0 and errno == ERANGE if the buffer was too small, like linux or mac OS X does. rv == buffer len might be a signal of truncation. rv > buffer len would be even worse not sure if some implementation returns the total length of the data, not just the amount put into the buffer. but as we use the returned length to "truncate" the buffer, we better make sure it is not longer than the buffer. also: freebsd listxattr memoryview len bugfix --- borg/xattr.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/borg/xattr.py b/borg/xattr.py index 0439b193c..99f7657bb 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -128,6 +128,12 @@ def _check(rv, path=None, detect_buffer_too_small=False): if isinstance(path, int): path = '' % path raise OSError(e, msg, path) + if detect_buffer_too_small and rv >= len(get_buffer()): + # freebsd does not error with ERANGE if the buffer is too small, + # it just fills the buffer, truncates and returns. 
+ # so, we play sure and just assume that result is truncated if + # it happens to be a full buffer. + raise BufferTooSmallError return rv @@ -323,7 +329,7 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only if n == 0: return [] names = [] - mv = memoryview(buf) + mv = memoryview(buf)[:n] while mv: length = mv[0] names.append(os.fsdecode(bytes(mv[1:1 + length]))) From b6ead3dce2cabd07b5fdf5f50a228f8e76108c5c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 04:17:27 +0200 Subject: [PATCH 12/22] xattr: use some string processing functions, dedup, simplify --- borg/xattr.py | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/borg/xattr.py b/borg/xattr.py index 99f7657bb..0099a2ad9 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -109,6 +109,22 @@ except OSError as e: raise Exception(msg) +def split_string0(buf): + """split a list of zero-terminated strings into python not-zero-terminated bytes""" + return buf.split(b'\0')[:-1] + + +def split_lstring(buf): + """split a list of length-prefixed strings into python not-length-prefixed bytes""" + result = [] + mv = memoryview(buf) + while mv: + length = mv[0] + result.append(bytes(mv[1:1 + length])) + mv = mv[1 + length:] + return result + + class BufferTooSmallError(Exception): """the buffer given to an xattr function was too small for the result""" @@ -200,10 +216,7 @@ if sys.platform.startswith('linux'): # pragma: linux only return libc.llistxattr(path, buf, size) n, buf = _listxattr_inner(func, path) - if n == 0: - return [] - names = buf[:n].split(b'\0')[:-1] - return [os.fsdecode(name) for name in names + return [os.fsdecode(name) for name in split_string0(buf[:n]) if not name.startswith(b'system.posix_acl_')] def getxattr(path, name, *, follow_symlinks=True): @@ -217,9 +230,7 @@ if sys.platform.startswith('linux'): # pragma: linux only return libc.lgetxattr(path, name, buf, size) n, buf = _getxattr_inner(func, path, 
name) - if n == 0: - return - return buf[:n] + return buf[:n] or None def setxattr(path, name, value, *, follow_symlinks=True): def func(path, name, value, size): @@ -262,10 +273,7 @@ elif sys.platform == 'darwin': # pragma: darwin only return libc.listxattr(path, buf, size, XATTR_NOFOLLOW) n, buf = _listxattr_inner(func, path) - if n == 0: - return [] - names = buf[:n].split(b'\0')[:-1] - return [os.fsdecode(name) for name in names] + return [os.fsdecode(name) for name in split_string0(buf[:n])] def getxattr(path, name, *, follow_symlinks=True): def func(path, name, buf, size): @@ -278,9 +286,7 @@ elif sys.platform == 'darwin': # pragma: darwin only return libc.getxattr(path, name, buf, size, 0, XATTR_NOFOLLOW) n, buf = _getxattr_inner(func, path, name) - if n == 0: - return - return buf[:n] + return buf[:n] or None def setxattr(path, name, value, *, follow_symlinks=True): def func(path, name, value, size): @@ -326,15 +332,7 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only return libc.extattr_list_link(path, ns, buf, size) n, buf = _listxattr_inner(func, path) - if n == 0: - return [] - names = [] - mv = memoryview(buf)[:n] - while mv: - length = mv[0] - names.append(os.fsdecode(bytes(mv[1:1 + length]))) - mv = mv[1 + length:] - return names + return [os.fsdecode(name) for name in split_lstring(buf[:n])] def getxattr(path, name, *, follow_symlinks=True): def func(path, name, buf, size): @@ -347,9 +345,7 @@ elif sys.platform.startswith('freebsd'): # pragma: freebsd only return libc.extattr_get_link(path, ns, name, buf, size) n, buf = _getxattr_inner(func, path, name) - if n == 0: - return - return buf[:n] + return buf[:n] or None def setxattr(path, name, value, *, follow_symlinks=True): def func(path, name, value, size): From 8630ebf3f0bd1a5743f9d8507b12d6661c1b875b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 04:21:21 +0200 Subject: [PATCH 13/22] xattr: fix module docstring --- borg/xattr.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/borg/xattr.py b/borg/xattr.py index 0099a2ad9..7a4a7ae92 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -1,5 +1,5 @@ -"""A basic extended attributes (xattr) implementation for Linux and MacOS X -""" +"""A basic extended attributes (xattr) implementation for Linux, FreeBSD and MacOS X.""" + import errno import os import re From c834b2969cea159cf1a163666381dec63c73ebcf Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 17:54:15 +0200 Subject: [PATCH 14/22] document archive limitation, #1452 --- docs/faq.rst | 11 +++++++++++ docs/internals.rst | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index 88418b180..c772f5fa7 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -62,6 +62,17 @@ Which file types, attributes, etc. are *not* preserved? holes in a sparse file. * filesystem specific attributes, like ext4 immutable bit, see :issue:`618`. +Are there other known limitations? +---------------------------------- + +- A single archive can only reference a limited volume of file/dir metadata, + usually corresponding to tens or hundreds of millions of files/dirs. + When trying to go beyond that limit, you will get a fatal IntegrityError + exception telling that the (archive) object is too big. + An easy workaround is to create multiple archives with fewer items each. + See also the :ref:`archive_limitation` and :issue:`1452`. + + Why is my backup bigger than with attic? Why doesn't |project_name| do compression by default? 
---------------------------------------------------------------------------------------------- diff --git a/docs/internals.rst b/docs/internals.rst index b088f68eb..82be188bf 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -160,12 +160,40 @@ object that contains: * version * name -* list of chunks containing item metadata +* list of chunks containing item metadata (size: count * ~40B) * cmdline * hostname * username * time +.. _archive_limitation: + +Note about archive limitations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The archive is currently stored as a single object in the repository +and thus limited in size to MAX_OBJECT_SIZE (20MiB). + +As one chunk list entry is ~40B, that means we can reference ~500.000 item +metadata stream chunks per archive. + +Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS). + +So that means the whole item metadata stream is limited to ~64GiB of chunks. +If compression is used, the amount of storable metadata is bigger - by the +compression factor. + +If the average size of an item entry is 100B (small size file, no ACLs/xattrs), +that means a limit of ~640 million files/directories per archive. + +If the average size of an item entry is 2kB (~100MB size files or more +ACLs/xattrs), the limit will be ~32 million files/directories per archive. + +If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal +IntegrityError will be raised. + +A workaround is to create multiple archives with fewer items each, see +also :issue:`1452`. 
The Item -------- @@ -174,7 +202,7 @@ Each item represents a file, directory or other fs item and is stored as an ``item`` dictionary that contains: * path -* list of data chunks +* list of data chunks (size: count * ~40B) * user * group * uid From 3c7dddcb99586c2e7ac791177598fa99e2fb459f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 18:00:50 +0200 Subject: [PATCH 15/22] update changelog --- docs/changes.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/changes.rst b/docs/changes.rst index 305be063c..4972dc382 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -57,6 +57,10 @@ Security fixes: - fix security issue with remote repository access, #1428 + +Version 1.0.7rc2 (not released yet) +----------------------------------- + Bug fixes: - do not write objects to repository that are bigger than the allowed size, @@ -64,6 +68,11 @@ Bug fixes: IMPORTANT: if you created archives with many millions of files or directories, please verify if you can open them successfully, e.g. try a "borg list REPO::ARCHIVE". 
+- fixed a race condition in extended attributes querying that led to the + entire file not being backed up (while logging the error, exit code = 1), + #1469 +- fixed a race condition in extended attributes querying that led to a crash, + #1462 Version 1.0.7rc1 (2016-08-05) From ef9e8a584bdddcef707546be7b6c9457dd282af4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 19:34:29 +0200 Subject: [PATCH 16/22] refactor buffer code into helpers.Buffer class, add tests --- borg/helpers.py | 42 ++++++++++++++++++++++++++++ borg/testsuite/helpers.py | 58 ++++++++++++++++++++++++++++++++++++++- borg/testsuite/xattr.py | 10 +++---- borg/xattr.py | 25 ++++++----------- 4 files changed, 112 insertions(+), 23 deletions(-) diff --git a/borg/helpers.py b/borg/helpers.py index 975ddca81..27b3f0d3a 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -10,6 +10,7 @@ import re from shutil import get_terminal_size import sys import platform +import threading import time import unicodedata import io @@ -655,6 +656,47 @@ def memoize(function): return decorated_function +class Buffer: + """ + provide a thread-local buffer + """ + def __init__(self, allocator, size=4096, limit=None): + """ + Initialize the buffer: use allocator(size) call to allocate a buffer. + Optionally, set the upper limit for the buffer size. + """ + assert callable(allocator), 'must give alloc(size) function as first param' + assert limit is None or size <= limit, 'initial size must be <= limit' + self._thread_local = threading.local() + self.allocator = allocator + self.limit = limit + self.resize(size, init=True) + + def __len__(self): + return len(self._thread_local.buffer) + + def resize(self, size, init=False): + """ + resize the buffer - to avoid frequent reallocation, we usually always grow (if needed). + giving init=True it is possible to first-time initialize or shrink the buffer. + if a buffer size beyond the limit is requested, raise ValueError. 
+ """ + size = int(size) + if self.limit is not None and size > self.limit: + raise ValueError('Requested buffer size %d is above the limit of %d.' % (size, self.limit)) + if init or len(self) < size: + self._thread_local.buffer = self.allocator(size) + + def get(self, size=None, init=False): + """ + return a buffer of at least the requested size (None: any current size). + init=True can be given to trigger shrinking of the buffer to the given size. + """ + if size is not None: + self.resize(size, init) + return self._thread_local.buffer + + @memoize def uid2user(uid, default=None): try: diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index 9e03019df..cbe4cd9d6 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ -15,7 +15,8 @@ from ..helpers import Location, format_file_size, format_timedelta, format_line, yes, TRUISH, FALSISH, DEFAULTISH, \ StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \ ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, \ - PatternMatcher, RegexPattern, PathPrefixPattern, FnmatchPattern, ShellPattern + PatternMatcher, RegexPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, \ + Buffer from . 
import BaseTestCase, environment_variable, FakeInputs @@ -714,6 +715,61 @@ def test_is_slow_msgpack(): assert not is_slow_msgpack() +class TestBuffer: + def test_type(self): + buffer = Buffer(bytearray) + assert isinstance(buffer.get(), bytearray) + buffer = Buffer(bytes) # don't do that in practice + assert isinstance(buffer.get(), bytes) + + def test_len(self): + buffer = Buffer(bytearray, size=0) + b = buffer.get() + assert len(buffer) == len(b) == 0 + buffer = Buffer(bytearray, size=1234) + b = buffer.get() + assert len(buffer) == len(b) == 1234 + + def test_resize(self): + buffer = Buffer(bytearray, size=100) + assert len(buffer) == 100 + b1 = buffer.get() + buffer.resize(200) + assert len(buffer) == 200 + b2 = buffer.get() + assert b2 is not b1 # new, bigger buffer + buffer.resize(100) + assert len(buffer) >= 100 + b3 = buffer.get() + assert b3 is b2 # still same buffer (200) + buffer.resize(100, init=True) + assert len(buffer) == 100 # except on init + b4 = buffer.get() + assert b4 is not b3 # new, smaller buffer + + def test_limit(self): + buffer = Buffer(bytearray, size=100, limit=200) + buffer.resize(200) + assert len(buffer) == 200 + with pytest.raises(ValueError): + buffer.resize(201) + assert len(buffer) == 200 + + def test_get(self): + buffer = Buffer(bytearray, size=100, limit=200) + b1 = buffer.get(50) + assert len(b1) >= 50 # == 100 + b2 = buffer.get(100) + assert len(b2) >= 100 # == 100 + assert b2 is b1 # did not need resizing yet + b3 = buffer.get(200) + assert len(b3) == 200 + assert b3 is not b2 # new, resized buffer + with pytest.raises(ValueError): + buffer.get(201) # beyond limit + assert len(buffer) == 200 + + def test_yes_input(): inputs = list(TRUISH) input = FakeInputs(inputs) diff --git a/borg/testsuite/xattr.py b/borg/testsuite/xattr.py index ecc92fa18..5693c753e 100644 --- a/borg/testsuite/xattr.py +++ b/borg/testsuite/xattr.py @@ -2,7 +2,7 @@ import os import tempfile import unittest -from ..xattr import is_enabled, getxattr, 
setxattr, listxattr, get_buffer +from ..xattr import is_enabled, getxattr, setxattr, listxattr, buffer from . import BaseTestCase @@ -41,20 +41,20 @@ class XattrTestCase(BaseTestCase): def test_listxattr_buffer_growth(self): # make it work even with ext4, which imposes rather low limits - get_buffer(size=64, init=True) + buffer.resize(size=64, init=True) # xattr raw key list will be size 9 * (10 + 1), which is > 64 keys = ['user.attr%d' % i for i in range(9)] for key in keys: setxattr(self.tmpfile.name, key, b'x') got_keys = listxattr(self.tmpfile.name) self.assert_equal_se(got_keys, keys) - self.assert_equal(len(get_buffer()), 128) + self.assert_equal(len(buffer), 128) def test_getxattr_buffer_growth(self): # make it work even with ext4, which imposes rather low limits - get_buffer(size=64, init=True) + buffer.resize(size=64, init=True) value = b'x' * 126 setxattr(self.tmpfile.name, 'user.big', value) got_value = getxattr(self.tmpfile.name, 'user.big') self.assert_equal(value, got_value) - self.assert_equal(len(get_buffer()), 128) + self.assert_equal(len(buffer), 128) diff --git a/borg/xattr.py b/borg/xattr.py index 7a4a7ae92..10f99ae66 100644 --- a/borg/xattr.py +++ b/borg/xattr.py @@ -6,11 +6,12 @@ import re import subprocess import sys import tempfile -import threading from ctypes import CDLL, create_string_buffer, c_ssize_t, c_size_t, c_char_p, c_int, c_uint32, get_errno from ctypes.util import find_library from distutils.version import LooseVersion +from .helpers import Buffer + from .logger import create_logger logger = create_logger() @@ -22,17 +23,7 @@ except AttributeError: ENOATTR = errno.ENODATA -def get_buffer(size=None, init=False): - if size is not None: - size = int(size) - assert size < 2 ** 24 - if init or len(thread_local.buffer) < size: - thread_local.buffer = create_string_buffer(size) - return thread_local.buffer - - -thread_local = threading.local() -get_buffer(size=4096, init=True) +buffer = Buffer(create_string_buffer, limit=2**24) def 
is_enabled(path=None): @@ -144,7 +135,7 @@ def _check(rv, path=None, detect_buffer_too_small=False): if isinstance(path, int): path = '' % path raise OSError(e, msg, path) - if detect_buffer_too_small and rv >= len(get_buffer()): + if detect_buffer_too_small and rv >= len(buffer): # freebsd does not error with ERANGE if the buffer is too small, # it just fills the buffer, truncates and returns. # so, we play sure and just assume that result is truncated if @@ -156,9 +147,9 @@ def _check(rv, path=None, detect_buffer_too_small=False): def _listxattr_inner(func, path): if isinstance(path, str): path = os.fsencode(path) - size = len(get_buffer()) + size = len(buffer) while True: - buf = get_buffer(size) + buf = buffer.get(size) try: n = _check(func(path, buf, size), path, detect_buffer_too_small=True) except BufferTooSmallError: @@ -171,9 +162,9 @@ def _getxattr_inner(func, path, name): if isinstance(path, str): path = os.fsencode(path) name = os.fsencode(name) - size = len(get_buffer()) + size = len(buffer) while True: - buf = get_buffer(size) + buf = buffer.get(size) try: n = _check(func(path, name, buf, size), path, detect_buffer_too_small=True) except BufferTooSmallError: From e1bc7b62f6114dd4960ceaaebd5310dd29de91dd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 12 Aug 2016 21:10:46 +0200 Subject: [PATCH 17/22] lz4: reuse helpers.Buffer --- borg/compress.pyx | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/borg/compress.pyx b/borg/compress.pyx index 13955a86b..f0dd0b1b3 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,25 +1,18 @@ -import threading import zlib try: import lzma except ImportError: lzma = None +from .helpers import Buffer + cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_compressBound(int inputSize) nogil 
-thread_local = threading.local() -thread_local.buffer = bytes() - - -cdef char *get_buffer(size): - size = int(size) - if len(thread_local.buffer) < size: - thread_local.buffer = bytes(size) - return thread_local.buffer +buffer = Buffer(bytearray, size=0) cdef class CompressorBase: @@ -88,7 +81,8 @@ class LZ4(CompressorBase): cdef char *source = idata cdef char *dest osize = LZ4_compressBound(isize) - dest = get_buffer(osize) + buf = buffer.get(osize) + dest = buf with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -108,7 +102,8 @@ class LZ4(CompressorBase): # allocate more if isize * 3 is already bigger, to avoid having to resize often. osize = max(int(1.1 * 2**23), isize * 3) while True: - dest = get_buffer(osize) + buf = buffer.get(osize) + dest = buf with nogil: rsize = LZ4_decompress_safe(source, dest, isize, osize) if rsize >= 0: From 95cf337fa54ad5d83efa96651aa6355ce53a8006 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Mon, 8 Aug 2016 14:49:25 +0200 Subject: [PATCH 18/22] Fix untracked segments made by moved DELETEs Fixes #1442 (note that the segments _still_ get generated, see the comment, they should be collected now on the next compaction run) --- borg/repository.py | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/borg/repository.py b/borg/repository.py index c33f49da9..40d73042f 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -292,6 +292,8 @@ class Repository: self.io.delete_segment(segment) unused = [] + # The first segment compaction creates, if any + first_new_segment = self.io.get_latest_segment() + 1 for segment in sorted(self.compact): if self.io.segment_exists(segment): for tag, key, offset, data in self.io.iter_objects(segment, include_data=True): @@ -307,15 +309,52 @@ class Repository: segments[segment] -= 1 elif tag == TAG_DELETE: if index_transaction_id is None or segment > index_transaction_id: + # (introduced in 
6425d16aa84be1eaaf88) + # This is needed to avoid object un-deletion if we crash between the commit and the deletion + # of old segments in complete_xfer(). + # + # However, this only happens if the crash also affects the FS to the effect that file deletions + # did not materialize consistently after journal recovery. If they always materialize in-order + # then this is not a problem, because the old segment containing a deleted object would be deleted + # before the segment containing the delete. + # + # Consider the following series of operations if we would not do this, ie. this entire if: + # would be removed. + # Columns are segments, lines are different keys (line 1 = some key, line 2 = some other key) + # Legend: P=TAG_PUT, D=TAG_DELETE, c=commit, i=index is written for latest commit + # + # Segment | 1 | 2 | 3 + # --------+-------+-----+------ + # Key 1 | P | D | + # Key 2 | P | | P + # commits | c i | c | c i + # --------+-------+-----+------ + # ^- compact_segments starts + # ^- complete_xfer commits, after that complete_xfer deletes + # segments 1 and 2 (and then the index would be written). + # + # Now we crash. But only segment 2 gets deleted, while segment 1 is still around. Now key 1 + # is suddenly undeleted (because the delete in segment 2 is now missing). + # Again, note the requirement here. We delete these in the correct order that this doesn't happen, + # and only if the FS materialization of these deletes is reordered or parts dropped this can happen. + # In this case it doesn't cause outright corruption, 'just' an index count mismatch, which will be + # fixed by borg-check --repair. + # + # Note that in this check the index state is the proxy for a "most definitely settled" repository state, + # ie. the assumption is that *all* operations on segments <= index state are completed and stable. 
try: - self.io.write_delete(key, raise_full=save_space) + new_segment = self.io.write_delete(key, raise_full=save_space) except LoggedIO.SegmentFull: complete_xfer() - self.io.write_delete(key) + new_segment = self.io.write_delete(key) + self.compact.add(new_segment) + self.segments.setdefault(new_segment, 0) assert segments[segment] == 0 unused.append(segment) complete_xfer() - self.compact = set() + # Moving of deletes creates new sparse segments, only store these. All other segments + # are compact now. + self.compact = {segment for segment in self.compact if segment >= first_new_segment} def replay_segments(self, index_transaction_id, segments_transaction_id): # fake an old client, so that in case we do not have an exclusive lock yet, prepare_txn will upgrade the lock: From 3b716f98ff6bdce75f7ce51d0123e3551488617b Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Sat, 13 Aug 2016 01:47:51 +0200 Subject: [PATCH 19/22] Add regression test for 95cf337 --- borg/testsuite/repository.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/borg/testsuite/repository.py b/borg/testsuite/repository.py index c50e785bb..9c5a5a466 100644 --- a/borg/testsuite/repository.py +++ b/borg/testsuite/repository.py @@ -8,8 +8,9 @@ from ..hashindex import NSIndex from ..helpers import Location, IntegrityError from ..locking import Lock, LockFailed from ..remote import RemoteRepository, InvalidRPCMethod -from ..repository import Repository, LoggedIO, TAG_COMMIT, MAX_DATA_SIZE +from ..repository import Repository, LoggedIO, TAG_DELETE, MAX_DATA_SIZE from . 
import BaseTestCase +from .hashindex import H UNSPECIFIED = object() # for default values where we can't use None @@ -227,6 +228,28 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase): io = self.repository.io assert not io.is_committed_segment(io.get_latest_segment()) + def test_moved_deletes_are_tracked(self): + self.repository.put(H(1), b'1') + self.repository.put(H(2), b'2') + self.repository.commit() + self.repository.delete(H(1)) + self.repository.commit() + last_segment = self.repository.io.get_latest_segment() + num_deletes = 0 + for tag, key, offset, data in self.repository.io.iter_objects(last_segment, include_data=True): + if tag == TAG_DELETE: + assert key == H(1) + num_deletes += 1 + assert num_deletes == 1 + assert last_segment in self.repository.compact + self.repository.put(H(3), b'3') + self.repository.commit() + assert last_segment not in self.repository.compact + assert not self.repository.io.segment_exists(last_segment) + last_segment = self.repository.io.get_latest_segment() + for tag, key, offset in self.repository.io.iter_objects(last_segment): + assert tag != TAG_DELETE + class RepositoryAppendOnlyTestCase(RepositoryTestCaseBase): def open(self, create=False): From 07b47ef4a54d367162a4c1d2777ac573f5949e43 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 13 Aug 2016 00:49:07 +0200 Subject: [PATCH 20/22] update CHANGES --- docs/changes.rst | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 4972dc382..601008417 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -58,8 +58,8 @@ Security fixes: - fix security issue with remote repository access, #1428 -Version 1.0.7rc2 (not released yet) ------------------------------------ +Version 1.0.7rc2 (2016-08-13) +----------------------------- Bug fixes: @@ -68,11 +68,33 @@ Bug fixes: IMPORTANT: if you created archives with many millions of files or directories, please verify if you can open 
them successfully, e.g. try a "borg list REPO::ARCHIVE". -- fixed a race condition in extended attributes querying that led to the - entire file not being backed up (while logging the error, exit code = 1), - #1469 -- fixed a race condition in extended attributes querying that led to a crash, - #1462 +- lz4 compression: dynamically enlarge the (de)compression buffer, the static + buffer was not big enough for archives with extremely many items, #1453 +- larger item metadata stream chunks, raise archive limit by 8x, #1452 +- fix untracked segments made by moved DELETEs, #1442 +- extended attributes (xattrs) related fixes: + + - fixed a race condition in xattrs querying that led to the entire file not + being backed up (while logging the error, exit code = 1), #1469 + - fixed a race condition in xattrs querying that led to a crash, #1462 + - raise OSError including the error message derived from errno, deal with + path being an integer FD + +Other changes: + +- print active env var override by default, #1467 +- xattr module: refactor code, deduplicate, clean up +- repository: split object size check into too small and too big +- add a transaction_id assertion, so borg init on a broken (inconsistent) + filesystem does not look like a coding error in borg, but points to the + real problem. 
+- explain confusing TypeError caused by compat support for old servers, #1456 +- add forgotten usage help file from build_usage +- refactor/unify buffer code into helpers.Buffer class, add tests +- docs: + + - document archive limitation, #1452 + - improve prune examples Version 1.0.7rc1 (2016-08-05) From 17aacb971948abf95e0a51f276f2e4759dd33864 Mon Sep 17 00:00:00 2001 From: enkore Date: Sat, 13 Aug 2016 10:18:41 +0200 Subject: [PATCH 21/22] Fix changes.rst formatting, clarify changelog --- docs/changes.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 601008417..7da62747a 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -65,13 +65,17 @@ Bug fixes: - do not write objects to repository that are bigger than the allowed size, borg will reject reading them, #1451. - IMPORTANT: if you created archives with many millions of files or - directories, please verify if you can open them successfully, - e.g. try a "borg list REPO::ARCHIVE". + + Important: if you created archives with many millions of files or + directories, please verify if you can open them successfully, + e.g. try a "borg list REPO::ARCHIVE". - lz4 compression: dynamically enlarge the (de)compression buffer, the static buffer was not big enough for archives with extremely many items, #1453 -- larger item metadata stream chunks, raise archive limit by 8x, #1452 +- larger item metadata stream chunks, raise archive item limit by 8x, #1452 - fix untracked segments made by moved DELETEs, #1442 + + Impact: Previously (metadata) segments could become untracked when deleting data, + these would never be cleaned up. 
- extended attributes (xattrs) related fixes: - fixed a race condition in xattrs querying that led to the entire file not From 42b6a838da46c87d85b5b32d45635ac25aec01fd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 14 Aug 2016 15:07:18 +0200 Subject: [PATCH 22/22] fix cyclic import issue, fix tests needed to increase ChunkBuffer size due to increased items stream chunk size to get the test working. --- src/borg/archive.py | 2 +- src/borg/helpers.py | 2 +- src/borg/testsuite/archive.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 52a03af85..35ccff0a1 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -169,7 +169,7 @@ class DownloadPipeline: class ChunkBuffer: - BUFFER_SIZE = 1 * 1024 * 1024 + BUFFER_SIZE = 8 * 1024 * 1024 def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS): self.buffer = BytesIO() diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 728bc6eff..2324d32a9 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -39,7 +39,6 @@ from . import crypto from . import hashindex from . 
import shellpattern from .constants import * # NOQA -from .compress import get_compressor # meta dict, data bytes _Chunk = namedtuple('_Chunk', 'meta data') @@ -1584,6 +1583,7 @@ class CompressionDecider2: return compr_spec, chunk def heuristic_lz4(self, compr_args, chunk): + from .compress import get_compressor meta, data = chunk lz4 = get_compressor('lz4') cdata = lz4.compress(data) diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index 30f619747..19db1a44c 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -109,7 +109,7 @@ class ChunkBufferTestCase(BaseTestCase): self.assert_equal(data, [Item(internal_dict=d) for d in unpacker]) def test_partial(self): - big = "0123456789" * 10000 + big = "0123456789abcdefghijklmnopqrstuvwxyz" * 25000 data = [Item(path='full', source=big), Item(path='partial', source=big)] cache = MockCache() key = PlaintextKey(None)