From 965e8a957fa7516d8bfee19b000a4a41a4f2e514 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 19 May 2023 16:40:44 +0200 Subject: [PATCH 1/2] tests: check buzhash chunksize distribution, see #7586 --- src/borg/testsuite/chunker_pytest.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py index 59c7a4515..e83c8ef3e 100644 --- a/src/borg/testsuite/chunker_pytest.py +++ b/src/borg/testsuite/chunker_pytest.py @@ -5,7 +5,7 @@ import tempfile import pytest from .chunker import cf -from ..chunker import ChunkerFixed, sparsemap, has_seek_hole +from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole from ..constants import * # NOQA BS = 4096 # fs block size @@ -136,3 +136,27 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size) + + +def test_buzhash_chunksize_distribution(): + data = os.urandom(1048576) + min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB + chunker = Chunker(0, min_exp, max_exp, mask, 4095) + f = BytesIO(data) + chunks = cf(chunker.chunkify(f)) + chunk_sizes = [len(chunk) for chunk in chunks] + chunks_count = len(chunks) + min_chunksize_observed = min(chunk_sizes) + max_chunksize_observed = max(chunk_sizes) + min_count = sum((int(size == 2 ** min_exp) for size in chunk_sizes)) + max_count = sum((int(size == 2 ** max_exp) for size in chunk_sizes)) + print(f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} " + f"min count: {min_count} max count: {max_count}") + # usually there will about 64 chunks + assert 32 < chunks_count < 128 + # chunks always must be between min and max (clipping must work): + assert min_chunksize_observed >= 2 ** min_exp + assert max_chunksize_observed <= 2 ** max_exp + # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size: + assert min_count < 10 + assert max_count < 10 From 5cc4e3af6af2d6a9008740c2e41fe96a116c1046 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 19 May 2023 17:46:29 +0200 Subject: [PATCH 2/2] tests: only warn about "invalid" chunker params, fixes #7590 we previously allowed some weird chunker params, so we just warn about them instead of rejecting them. e.g. one could use super small chunk sizes. borg can do that, but you'll end up with a huge amount of chunks and very large hash tables (RAM usage) to manage them. also, one can violate the min <= mask <= max chunker param condition as in #7590 and it will somehow work, but will likely not dedup as good as when not violating that. borg2 **will** reject such strange chunker params, see #7586 for some ideas how to "fix" your repos. --- src/borg/helpers/parseformat.py | 24 ++++++++++++------------ src/borg/testsuite/helpers.py | 16 ++++++++++------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 9dfda8d24..d96637b4d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -95,10 +95,16 @@ def interval(s): def ChunkerParams(s): + def reject_or_warn(msg, reject): + if reject: + raise argparse.ArgumentTypeError(msg) + else: + logger.warning(msg) + params = s.strip().split(',') count = len(params) if count == 0: - raise argparse.ArgumentTypeError('no chunker params given') + reject_or_warn('no chunker params given', True) algo = params[0].lower() if algo == CH_FIXED and 2 <= count <= 3: # fixed, block_size[, header_size] block_size = int(params[1]) @@ -109,11 +115,9 @@ def ChunkerParams(s): # or in-memory chunk management. # choose the block (chunk) size wisely: if you have a lot of data and you cut # it into very small chunks, you are asking for trouble! - raise argparse.ArgumentTypeError('block_size must not be less than 64 Bytes') + reject_or_warn('block_size must not be less than 64 Bytes', False) if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE: - raise argparse.ArgumentTypeError( - 'block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE - ) + reject_or_warn('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE, True) return algo, block_size, header_size if algo == 'default' and count == 1: # default return CHUNKER_PARAMS @@ -121,16 +125,12 @@ def ChunkerParams(s): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4:]) if not (chunk_min <= chunk_mask <= chunk_max): - raise argparse.ArgumentTypeError('required: chunk_min <= chunk_mask <= chunk_max') + reject_or_warn('required: chunk_min <= chunk_mask <= chunk_max', False) if chunk_min < 6: # see comment in 'fixed' algo check - raise argparse.ArgumentTypeError( - 'min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)' - ) + reject_or_warn('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)', False) if chunk_max > 23: - raise argparse.ArgumentTypeError( - 'max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)' - ) + reject_or_warn('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)', True) return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size raise argparse.ArgumentTypeError('invalid chunker params') diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 99057a3d2..ed7140e5a 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -349,16 +349,20 @@ def test_chunkerparams(): assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0) assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200) # invalid values checking + borg2 = False # for borg < 2, we only emit a warning, but do not raise ArgumentTypeError for some cases with pytest.raises(ArgumentTypeError): ChunkerParams('crap,1,2,3,4') # invalid algo - with pytest.raises(ArgumentTypeError): - ChunkerParams('buzhash,5,7,6,4095') # too small min. size + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('buzhash,5,7,6,4095') # too small min. size with pytest.raises(ArgumentTypeError): ChunkerParams('buzhash,19,24,21,4095') # too big max. size - with pytest.raises(ArgumentTypeError): - ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max - with pytest.raises(ArgumentTypeError): - ChunkerParams('fixed,63') # too small block size + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('fixed,63') # too small block size with pytest.raises(ArgumentTypeError): ChunkerParams('fixed,%d,%d' % (MAX_DATA_SIZE + 1, 4096)) # too big block size with pytest.raises(ArgumentTypeError):