diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 9dfda8d24..d96637b4d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -95,10 +95,16 @@ def interval(s): def ChunkerParams(s): + def reject_or_warn(msg, reject): + if reject: + raise argparse.ArgumentTypeError(msg) + else: + logger.warning(msg) + params = s.strip().split(',') count = len(params) if count == 0: - raise argparse.ArgumentTypeError('no chunker params given') + reject_or_warn('no chunker params given', True) algo = params[0].lower() if algo == CH_FIXED and 2 <= count <= 3: # fixed, block_size[, header_size] block_size = int(params[1]) @@ -109,11 +115,9 @@ def ChunkerParams(s): # or in-memory chunk management. # choose the block (chunk) size wisely: if you have a lot of data and you cut # it into very small chunks, you are asking for trouble! - raise argparse.ArgumentTypeError('block_size must not be less than 64 Bytes') + reject_or_warn('block_size must not be less than 64 Bytes', False) if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE: - raise argparse.ArgumentTypeError( - 'block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE - ) + reject_or_warn('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE, True) return algo, block_size, header_size if algo == 'default' and count == 1: # default return CHUNKER_PARAMS @@ -121,16 +125,12 @@ def ChunkerParams(s): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4:]) if not (chunk_min <= chunk_mask <= chunk_max): - raise argparse.ArgumentTypeError('required: chunk_min <= chunk_mask <= chunk_max') + reject_or_warn('required: chunk_min <= chunk_mask <= chunk_max', False) if chunk_min < 6: # see comment in 'fixed' algo check - raise argparse.ArgumentTypeError( - 'min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)' - ) + reject_or_warn('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)', False) if chunk_max > 23: - raise argparse.ArgumentTypeError( - 'max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)' - ) + reject_or_warn('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)', True) return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size raise argparse.ArgumentTypeError('invalid chunker params') diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py index 59c7a4515..e83c8ef3e 100644 --- a/src/borg/testsuite/chunker_pytest.py +++ b/src/borg/testsuite/chunker_pytest.py @@ -5,7 +5,7 @@ import tempfile import pytest from .chunker import cf -from ..chunker import ChunkerFixed, sparsemap, has_seek_hole +from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole from ..constants import * # NOQA BS = 4096 # fs block size @@ -136,3 +136,27 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size) + + +def test_buzhash_chunksize_distribution(): + data = os.urandom(1048576) + min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB + chunker = Chunker(0, min_exp, max_exp, mask, 4095) + f = BytesIO(data) + chunks = cf(chunker.chunkify(f)) + chunk_sizes = [len(chunk) for chunk in chunks] + chunks_count = len(chunks) + min_chunksize_observed = min(chunk_sizes) + max_chunksize_observed = max(chunk_sizes) + min_count = sum((int(size == 2 ** min_exp) for size in chunk_sizes)) + max_count = sum((int(size == 2 ** max_exp) for size in chunk_sizes)) + print(f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} " + f"min count: {min_count} max count: {max_count}") + # usually there will about 64 chunks + assert 32 < chunks_count < 128 + # chunks always must be between min and max (clipping must work): + assert min_chunksize_observed >= 2 ** min_exp + assert max_chunksize_observed <= 2 ** max_exp + # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size: + assert min_count < 10 + assert max_count < 10 diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 99057a3d2..ed7140e5a 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -349,16 +349,20 @@ def test_chunkerparams(): assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0) assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200) # invalid values checking + borg2 = False # for borg < 2, we only emit a warning, but do not raise ArgumentTypeError for some cases with pytest.raises(ArgumentTypeError): ChunkerParams('crap,1,2,3,4') # invalid algo - with pytest.raises(ArgumentTypeError): - ChunkerParams('buzhash,5,7,6,4095') # too small min. size + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('buzhash,5,7,6,4095') # too small min. size with pytest.raises(ArgumentTypeError): ChunkerParams('buzhash,19,24,21,4095') # too big max. size - with pytest.raises(ArgumentTypeError): - ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max - with pytest.raises(ArgumentTypeError): - ChunkerParams('fixed,63') # too small block size + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max + if borg2: + with pytest.raises(ArgumentTypeError): + ChunkerParams('fixed,63') # too small block size with pytest.raises(ArgumentTypeError): ChunkerParams('fixed,%d,%d' % (MAX_DATA_SIZE + 1, 4096)) # too big block size with pytest.raises(ArgumentTypeError):