Merge pull request #7589 from ThomasWaldmann/chunker-params-1.2

relax chunker params validation, tests (1.2-maint)
This commit is contained in:
TW 2023-05-19 19:19:04 +02:00 committed by GitHub
commit 53bedfb63b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 19 deletions

View file

@ -95,10 +95,16 @@ def interval(s):
def ChunkerParams(s):
def reject_or_warn(msg, reject):
if reject:
raise argparse.ArgumentTypeError(msg)
else:
logger.warning(msg)
params = s.strip().split(',')
count = len(params)
if count == 0:
raise argparse.ArgumentTypeError('no chunker params given')
reject_or_warn('no chunker params given', True)
algo = params[0].lower()
if algo == CH_FIXED and 2 <= count <= 3: # fixed, block_size[, header_size]
block_size = int(params[1])
@ -109,11 +115,9 @@ def ChunkerParams(s):
# or in-memory chunk management.
# choose the block (chunk) size wisely: if you have a lot of data and you cut
# it into very small chunks, you are asking for trouble!
raise argparse.ArgumentTypeError('block_size must not be less than 64 Bytes')
reject_or_warn('block_size must not be less than 64 Bytes', False)
if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE:
raise argparse.ArgumentTypeError(
'block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE
)
reject_or_warn('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE, True)
return algo, block_size, header_size
if algo == 'default' and count == 1: # default
return CHUNKER_PARAMS
@ -121,16 +125,12 @@ def ChunkerParams(s):
if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4:])
if not (chunk_min <= chunk_mask <= chunk_max):
raise argparse.ArgumentTypeError('required: chunk_min <= chunk_mask <= chunk_max')
reject_or_warn('required: chunk_min <= chunk_mask <= chunk_max', False)
if chunk_min < 6:
# see comment in 'fixed' algo check
raise argparse.ArgumentTypeError(
'min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)'
)
reject_or_warn('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)', False)
if chunk_max > 23:
raise argparse.ArgumentTypeError(
'max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)'
)
reject_or_warn('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)', True)
return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size
raise argparse.ArgumentTypeError('invalid chunker params')

View file

@ -5,7 +5,7 @@ import tempfile
import pytest
from .chunker import cf
from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole
from ..constants import * # NOQA
BS = 4096 # fs block size
@ -136,3 +136,27 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
fn = str(tmpdir / fname)
make_sparsefile(fn, sparse_map, header_size=header_size)
get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)
def test_buzhash_chunksize_distribution():
    """Chunk 1 MiB of random data with the buzhash chunker and sanity-check the sizes.

    Checks that the number of chunks is in a plausible range, that every chunk
    size lies within [2 ** min_exp, 2 ** max_exp], and that only a few chunks
    have exactly the minimum or maximum size.
    """
    data = os.urandom(1048576)
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    lower_limit = 2 ** min_exp
    upper_limit = 2 ** max_exp

    # NOTE(review): first argument is presumably the chunker seed and the last
    # one the window size - confirm against the Chunker signature.
    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
    chunks = cf(chunker.chunkify(BytesIO(data)))

    sizes = [len(piece) for piece in chunks]
    total = len(chunks)
    smallest = min(sizes)
    largest = max(sizes)
    # how many chunks have exactly the minimum / maximum size
    at_lower_limit = sizes.count(lower_limit)
    at_upper_limit = sizes.count(upper_limit)
    print(
        f"count: {total} min: {smallest} max: {largest} "
        f"min count: {at_lower_limit} max count: {at_upper_limit}"
    )

    # usually there will be about 64 chunks
    assert 32 < total < 128
    # chunks always must be between min and max (clipping must work):
    assert smallest >= lower_limit
    assert largest <= upper_limit
    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
    assert at_lower_limit < 10
    assert at_upper_limit < 10

View file

@ -349,16 +349,20 @@ def test_chunkerparams():
assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0)
assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200)
# invalid values checking
borg2 = False # for borg < 2, we only emit a warning, but do not raise ArgumentTypeError for some cases
with pytest.raises(ArgumentTypeError):
ChunkerParams('crap,1,2,3,4') # invalid algo
with pytest.raises(ArgumentTypeError):
ChunkerParams('buzhash,5,7,6,4095') # too small min. size
if borg2:
with pytest.raises(ArgumentTypeError):
ChunkerParams('buzhash,5,7,6,4095') # too small min. size
with pytest.raises(ArgumentTypeError):
ChunkerParams('buzhash,19,24,21,4095') # too big max. size
with pytest.raises(ArgumentTypeError):
ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max
with pytest.raises(ArgumentTypeError):
ChunkerParams('fixed,63') # too small block size
if borg2:
with pytest.raises(ArgumentTypeError):
ChunkerParams('buzhash,23,19,21,4095') # violates min <= mask <= max
if borg2:
with pytest.raises(ArgumentTypeError):
ChunkerParams('fixed,63') # too small block size
with pytest.raises(ArgumentTypeError):
ChunkerParams('fixed,%d,%d' % (MAX_DATA_SIZE + 1, 4096)) # too big block size
with pytest.raises(ArgumentTypeError):