diff --git a/src/borg/testsuite/chunkers/__init__.py b/src/borg/testsuite/chunkers/__init__.py
index 6e6eaa5de..26714517a 100644
--- a/src/borg/testsuite/chunkers/__init__.py
+++ b/src/borg/testsuite/chunkers/__init__.py
@@ -1,5 +1,10 @@
+import os
+import tempfile
+
 from borg.constants import *  # noqa
 
+from ...chunkers import has_seek_hole
+
 
 def cf(chunks):
     """chunk filter"""
@@ -15,3 +20,79 @@ def cf(chunks):
         assert False, "unexpected allocation value"
 
     return [_cf(chunk) for chunk in chunks]
+
+
+def make_sparsefile(fname, sparsemap, header_size=0):
+    """Create the file `fname` with the data / hole layout described by `sparsemap`.
+
+    If `header_size` is non-zero, that many b"H" bytes are written first. Then, for each
+    (offset, size, is_data) entry, a data range is written as b"X" bytes and a hole range
+    is skipped by seeking forward. Finally, the file is truncated to the accumulated total
+    size and that size is verified.
+    """
+    with open(fname, "wb") as fd:
+        total = 0
+        if header_size:
+            fd.write(b"H" * header_size)
+            total += header_size
+        # the offset field is unused here: ranges are written back to back
+        for _offset, size, is_data in sparsemap:
+            if is_data:
+                fd.write(b"X" * size)
+            else:
+                fd.seek(size, os.SEEK_CUR)
+            total += size
+        # seeking alone does not grow the file, so a trailing hole needs the explicit truncate
+        fd.truncate(total)
+    assert os.path.getsize(fname) == total
+
+
+def make_content(sparsemap, header_size=0):
+    """Return the expected content of a file created by make_sparsefile, as a list.
+
+    The header (if any) and data ranges are represented as bytes objects, hole ranges
+    as their size (int).
+    """
+    result = []
+    if header_size:
+        result.append(b"H" * header_size)
+    for _offset, size, is_data in sparsemap:
+        if is_data:
+            result.append(b"X" * size)  # bytes!
+        else:
+            result.append(size)  # int!
+    return result
+
+
+def fs_supports_sparse():
+    """Return True if SEEK_HOLE / SEEK_DATA work and report the expected offsets for a test file."""
+    if not has_seek_hole:
+        return False
+    with tempfile.TemporaryDirectory() as tmpdir:
+        fn = os.path.join(tmpdir, "test_sparse")
+        # one hole block followed by one data block
+        make_sparsefile(fn, [(0, BS, False), (BS, BS, True)])
+        with open(fn, "rb") as f:
+            try:
+                offset_hole = f.seek(0, os.SEEK_HOLE)
+                offset_data = f.seek(0, os.SEEK_DATA)
+            except OSError:
+                # no sparse support if these seeks do not work
+                return False
+        return offset_hole == 0 and offset_data == BS
+
+
+BS = 4096  # fs block size
+
+# some sparse files. X = content blocks, _ = sparse blocks.
+# X__XXX____
+map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)]
+
+# _XX___XXXX
+map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)]
+
+# XXX
+map_notsparse = [(0 * BS, 3 * BS, True)]
+
+# ___
+map_onlysparse = [(0 * BS, 3 * BS, False)]
diff --git a/src/borg/testsuite/chunkers/buzhash_pytest_test.py b/src/borg/testsuite/chunkers/buzhash_pytest_test.py
index 8f436ec2d..3b33a1cc9 100644
--- a/src/borg/testsuite/chunkers/buzhash_pytest_test.py
+++ b/src/borg/testsuite/chunkers/buzhash_pytest_test.py
@@ -1,5 +1,6 @@
 from hashlib import sha256
 from io import BytesIO
+import os
 
 from . import cf
 from ...chunkers import Chunker
@@ -39,3 +40,31 @@ def test_chunkpoints_unchanged():
     # Future chunker optimisations must not change this, or existing repos will bloat.
     overall_hash = H(b"".join(runs))
     assert overall_hash == hex_to_bin("a43d0ecb3ae24f38852fcc433a83dacd28fe0748d09cc73fc11b69cf3f1a7299")
+
+
+def test_buzhash_chunksize_distribution():
+    """Chunk 1MiB of random data and check that chunk sizes are clipped, but rarely hit the min/max limits."""
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
diff --git a/src/borg/testsuite/chunkers/failing_pytest_test.py b/src/borg/testsuite/chunkers/failing_pytest_test.py
new file mode 100644
index 000000000..cb558d250
--- /dev/null
+++ b/src/borg/testsuite/chunkers/failing_pytest_test.py
@@ -0,0 +1,33 @@
+from io import BytesIO
+
+import pytest
+
+from ...chunkers import ChunkerFailing
+from ...constants import *  # NOQA
+
+
+def test_chunker_failing():
+    """ChunkerFailing must yield chunks or raise OSError as scripted by "rEErrr", also across chunkify() calls."""
+    SIZE = 4096
+    data = bytes(2 * SIZE + 1000)
+    chunker = ChunkerFailing(SIZE, "rEErrr")  # cut SIZE-sized chunks, start failing at block 1, fail 2 times
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        c1 = next(ch)  # block 0: ok
+        assert c1.meta["allocation"] == CH_DATA
+        assert c1.data == data[:SIZE]
+        with pytest.raises(OSError):  # block 1: failure 1
+            next(ch)
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        with pytest.raises(OSError):  # block 2: failure 2
+            next(ch)
+    with BytesIO(data) as fd:
+        ch = chunker.chunkify(fd)
+        c1 = next(ch)  # block 3: success!
+        c2 = next(ch)  # block 4: success!
+        c3 = next(ch)  # block 5: success!
+        assert c1.meta["allocation"] == c2.meta["allocation"] == c3.meta["allocation"] == CH_DATA
+        assert c1.data == data[:SIZE]
+        assert c2.data == data[SIZE : 2 * SIZE]
+        assert c3.data == data[2 * SIZE :]
diff --git a/src/borg/testsuite/chunkers/fixed_pytest_test.py b/src/borg/testsuite/chunkers/fixed_pytest_test.py
new file mode 100644
index 000000000..2ab5faf48
--- /dev/null
+++ b/src/borg/testsuite/chunkers/fixed_pytest_test.py
@@ -0,0 +1,56 @@
+import pytest
+
+from . import cf, make_sparsefile, make_content, fs_supports_sparse
+from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse
+from ...chunkers import ChunkerFixed
+from ...constants import *  # NOQA
+
+
+@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files")
+@pytest.mark.parametrize(
+    "fname, sparse_map, header_size, sparse",
+    [
+        ("sparse1", map_sparse1, 0, False),
+        ("sparse1", map_sparse1, 0, True),
+        ("sparse1", map_sparse1, BS, False),
+        ("sparse1", map_sparse1, BS, True),
+        ("sparse2", map_sparse2, 0, False),
+        ("sparse2", map_sparse2, 0, True),
+        ("sparse2", map_sparse2, BS, False),
+        ("sparse2", map_sparse2, BS, True),
+        ("onlysparse", map_onlysparse, 0, False),
+        ("onlysparse", map_onlysparse, 0, True),
+        ("onlysparse", map_onlysparse, BS, False),
+        ("onlysparse", map_onlysparse, BS, True),
+        ("notsparse", map_notsparse, 0, False),
+        ("notsparse", map_notsparse, 0, True),
+        ("notsparse", map_notsparse, BS, False),
+        ("notsparse", map_notsparse, BS, True),
+    ],
+)
+def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
+    """Chunk a (possibly sparse) test file with ChunkerFixed and compare the result to the expected content."""
+
+    def get_chunks(fname, sparse, header_size):
+        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
+        with open(fname, "rb") as fd:
+            return cf(chunker.chunkify(fd))
+
+    def merge_runs(pieces):
+        # Coalesce adjacent pieces of the same kind: data (bytes) is concatenated, hole sizes (int) are summed.
+        # NOTE(review): the chunker presumably cuts pieces of at most 4096 bytes, while make_content yields one
+        # piece per map range - merging makes both sides comparable independent of the piece boundaries.
+        merged = []
+        for piece in pieces:
+            if merged and isinstance(merged[-1], bytes) == isinstance(piece, bytes):
+                merged[-1] += piece
+            else:
+                merged.append(piece)
+        return merged
+
+    fn = str(tmpdir / fname)
+    make_sparsefile(fn, sparse_map, header_size=header_size)
+    chunks = get_chunks(fn, sparse=sparse, header_size=header_size)
+    expected = make_content(sparse_map, header_size=header_size)
+    # a bare `a == b` expression checks nothing - the comparison result must be asserted
+    assert merge_runs(chunks) == merge_runs(expected)
diff --git a/src/borg/testsuite/chunkers/chunker_pytest_test.py b/src/borg/testsuite/chunkers/reader_pytest_test.py
similarity index 68%
rename from src/borg/testsuite/chunkers/chunker_pytest_test.py
rename to src/borg/testsuite/chunkers/reader_pytest_test.py
index d2cf83c1c..f2e50b6d8 100644
--- a/src/borg/testsuite/chunkers/chunker_pytest_test.py
+++ b/src/borg/testsuite/chunkers/reader_pytest_test.py
@@ -1,84 +1,13 @@
-from io import BytesIO
 import os
-import tempfile
+from io import BytesIO
 
 import pytest
 
-from . 
import cf -from ...chunkers import ( - Chunker, - ChunkerFixed, - sparsemap, - has_seek_hole, - ChunkerFailing, - FileReader, - FileFMAPReader, - Chunk, -) +from . import make_sparsefile, fs_supports_sparse +from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse +from ...chunkers import sparsemap, FileReader, FileFMAPReader, Chunk from ...constants import * # NOQA -BS = 4096 # fs block size - -# some sparse files. X = content blocks, _ = sparse blocks. -# X__XXX____ -map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)] - -# _XX___XXXX -map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)] - -# XXX -map_notsparse = [(0 * BS, 3 * BS, True)] - -# ___ -map_onlysparse = [(0 * BS, 3 * BS, False)] - - -def make_sparsefile(fname, sparsemap, header_size=0): - with open(fname, "wb") as fd: - total = 0 - if header_size: - fd.write(b"H" * header_size) - total += header_size - for offset, size, is_data in sparsemap: - if is_data: - fd.write(b"X" * size) - else: - fd.seek(size, os.SEEK_CUR) - total += size - fd.truncate(total) - assert os.path.getsize(fname) == total - - -def make_content(sparsemap, header_size=0): - result = [] - total = 0 - if header_size: - result.append(b"H" * header_size) - total += header_size - for offset, size, is_data in sparsemap: - if is_data: - result.append(b"X" * size) # bytes! - else: - result.append(size) # int! 
- total += size - return result - - -def fs_supports_sparse(): - if not has_seek_hole: - return False - with tempfile.TemporaryDirectory() as tmpdir: - fn = os.path.join(tmpdir, "test_sparse") - make_sparsefile(fn, [(0, BS, False), (BS, BS, True)]) - with open(fn, "rb") as f: - try: - offset_hole = f.seek(0, os.SEEK_HOLE) - offset_data = f.seek(0, os.SEEK_DATA) - except OSError: - # no sparse support if these seeks do not work - return False - return offset_hole == 0 and offset_data == BS - @pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files") @pytest.mark.parametrize( @@ -103,92 +32,6 @@ def test_sparsemap(tmpdir, fname, sparse_map): assert get_sparsemap_fd(fn) == sparse_map -@pytest.mark.skipif(not fs_supports_sparse(), reason="fs does not support sparse files") -@pytest.mark.parametrize( - "fname, sparse_map, header_size, sparse", - [ - ("sparse1", map_sparse1, 0, False), - ("sparse1", map_sparse1, 0, True), - ("sparse1", map_sparse1, BS, False), - ("sparse1", map_sparse1, BS, True), - ("sparse2", map_sparse2, 0, False), - ("sparse2", map_sparse2, 0, True), - ("sparse2", map_sparse2, BS, False), - ("sparse2", map_sparse2, BS, True), - ("onlysparse", map_onlysparse, 0, False), - ("onlysparse", map_onlysparse, 0, True), - ("onlysparse", map_onlysparse, BS, False), - ("onlysparse", map_onlysparse, BS, True), - ("notsparse", map_notsparse, 0, False), - ("notsparse", map_notsparse, 0, True), - ("notsparse", map_notsparse, BS, False), - ("notsparse", map_notsparse, BS, True), - ], -) -def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): - def get_chunks(fname, sparse, header_size): - chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse) - with open(fname, "rb") as fd: - return cf(chunker.chunkify(fd)) - - fn = str(tmpdir / fname) - make_sparsefile(fn, sparse_map, header_size=header_size) - get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size) - 
- -def test_chunker_failing(): - SIZE = 4096 - data = bytes(2 * SIZE + 1000) - chunker = ChunkerFailing(SIZE, "rEErrr") # cut chunks, start failing at block 1, fail 2 times - with BytesIO(data) as fd: - ch = chunker.chunkify(fd) - c1 = next(ch) # block 0: ok - assert c1.meta["allocation"] == CH_DATA - assert c1.data == data[:SIZE] - with pytest.raises(OSError): # block 1: failure 1 - next(ch) - with BytesIO(data) as fd: - ch = chunker.chunkify(fd) - with pytest.raises(OSError): # block 2: failure 2 - next(ch) - with BytesIO(data) as fd: - ch = chunker.chunkify(fd) - c1 = next(ch) # block 3: success! - c2 = next(ch) # block 4: success! - c3 = next(ch) # block 5: success! - assert c1.meta["allocation"] == c2.meta["allocation"] == c3.meta["allocation"] == CH_DATA - assert c1.data == data[:SIZE] - assert c2.data == data[SIZE : 2 * SIZE] - assert c3.data == data[2 * SIZE :] - - -def test_buzhash_chunksize_distribution(): - data = os.urandom(1048576) - min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB - chunker = Chunker(0, min_exp, max_exp, mask, 4095) - f = BytesIO(data) - chunks = cf(chunker.chunkify(f)) - del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp - chunk_sizes = [len(chunk) for chunk in chunks] - chunks_count = len(chunks) - min_chunksize_observed = min(chunk_sizes) - max_chunksize_observed = max(chunk_sizes) - min_count = sum(int(size == 2**min_exp) for size in chunk_sizes) - max_count = sum(int(size == 2**max_exp) for size in chunk_sizes) - print( - f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} " - f"min count: {min_count} max count: {max_count}" - ) - # usually there will about 64 chunks - assert 32 < chunks_count < 128 - # chunks always must be between min and max (clipping must work): - assert min_chunksize_observed >= 2**min_exp - assert max_chunksize_observed <= 2**max_exp - # most chunks should be cut due to buzhash triggering, not due to clipping at 
min/max size: - assert min_count < 10 - assert max_count < 10 - - @pytest.mark.parametrize( "file_content, read_size, expected_data, expected_allocation, expected_size", [