From fa376b0d9de835dcbbb61ad1a631528f04325180 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 22 Nov 2025 03:58:35 +0100 Subject: [PATCH 1/9] Fix ChunkerFixed sparse handling and update tests - Update fixed_test.py expectations for non-sparse chunking. - Enable `sparse=True` in interaction_test.py and reader_test.py where zero detection is required. - Catch `ValueError` in _build_fmap to support `BytesIO` seeking. --- src/borg/chunkers/reader.pyx | 5 ++- src/borg/testsuite/chunkers/fixed_test.py | 35 ++++++++++++++++++- .../testsuite/chunkers/interaction_test.py | 2 +- src/borg/testsuite/chunkers/reader_test.py | 4 +-- 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/borg/chunkers/reader.pyx b/src/borg/chunkers/reader.pyx index 2a45e55ca..d33917a3a 100644 --- a/src/borg/chunkers/reader.pyx +++ b/src/borg/chunkers/reader.pyx @@ -137,7 +137,7 @@ class FileFMAPReader: if self.try_sparse: try: fmap = list(sparsemap(self.fd, self.fh)) - except OSError as err: + except (OSError, ValueError) as err: # seeking did not work pass @@ -170,6 +170,9 @@ class FileFMAPReader: # read block from the range data = dread(offset, wanted, self.fd, self.fh) got = len(data) + # Detect zero-filled blocks regardless of sparse mode. + # Zero detection is important to avoid reading/storing allocated zeros + # even when we are not using sparse file handling based on SEEK_HOLE/SEEK_DATA. if zeros.startswith(data): data = None allocation = CH_ALLOC diff --git a/src/borg/testsuite/chunkers/fixed_test.py b/src/borg/testsuite/chunkers/fixed_test.py index b8598a926..8fac894f1 100644 --- a/src/borg/testsuite/chunkers/fixed_test.py +++ b/src/borg/testsuite/chunkers/fixed_test.py @@ -40,7 +40,40 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) - get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size) + expected_content = make_content(sparse_map, header_size=header_size) + + # ChunkerFixed splits everything into fixed-size chunks (except maybe the header) + # We need to split the expected content similarly. + expected = [] + + # Handle header if present (it's the first item if header_size > 0) + if header_size > 0: + header = expected_content.pop(0) + expected.append(header) + + # Flatten the rest and split into 4096 chunks + current_chunk_size = 4096 + for item in expected_content: + if isinstance(item, int): + # Hole + count = item + while count > 0: + size = min(count, current_chunk_size) + expected.append(size) + count -= size + else: + # Data + data = item + while len(data) > 0: + size = min(len(data), current_chunk_size) + expected.append(data[:size]) + data = data[size:] + + if not sparse: + # if the chunker is not sparse-aware, it will read holes as zeros + expected = [b"\0" * x if isinstance(x, int) else x for x in expected] + + assert get_chunks(fn, sparse=sparse, header_size=header_size) == expected @pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1") diff --git a/src/borg/testsuite/chunkers/interaction_test.py b/src/borg/testsuite/chunkers/interaction_test.py index 45c175299..4417dc423 100644 --- a/src/borg/testsuite/chunkers/interaction_test.py +++ b/src/borg/testsuite/chunkers/interaction_test.py @@ -29,7 +29,7 @@ def test_reader_chunker_interaction(chunker_params): random_data = os.urandom(data_size // 3) + b"\0" * (data_size // 3) + os.urandom(data_size // 3) # Chunk the data - chunker = get_chunker(*chunker_params) + chunker = get_chunker(*chunker_params, sparse=True) data_file = BytesIO(random_data) chunks = list(chunker.chunkify(data_file)) diff --git a/src/borg/testsuite/chunkers/reader_test.py b/src/borg/testsuite/chunkers/reader_test.py index ff56350ee..f269d95cb 100644 --- a/src/borg/testsuite/chunkers/reader_test.py +++ b/src/borg/testsuite/chunkers/reader_test.py @@ -170,7 +170,7 @@ def test_filereader_read_with_mock(mock_chunks, read_size, expected_data, expect ) def test_filefmapreader_basic(file_content, read_size, expected_chunks): """Test basic functionality of FileFMAPReader with different file contents.""" - reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=False, fmap=None) + reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=True, fmap=None) # Collect all chunks from blockify chunks = list(reader.blockify()) @@ -252,7 +252,7 @@ def test_filefmapreader_allocation_types(zeros_length, read_size, expected_alloc # Create a file with all zeros file_content = b"\0" * zeros_length - reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=False, fmap=None) + reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=True, fmap=None) # Collect all chunks from blockify chunks = list(reader.blockify()) From 73bca86fb30dc22be94b8a68b2195e693d4d8003 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 26 Nov 2025 12:07:29 +0100 Subject: [PATCH 2/9] comment / use BS in test --- src/borg/chunkers/reader.pyx | 5 ++++- src/borg/testsuite/chunkers/fixed_test.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/borg/chunkers/reader.pyx b/src/borg/chunkers/reader.pyx index d33917a3a..115281d42 100644 --- a/src/borg/chunkers/reader.pyx +++ b/src/borg/chunkers/reader.pyx @@ -138,7 +138,10 @@ class FileFMAPReader: try: fmap = list(sparsemap(self.fd, self.fh)) except (OSError, ValueError) as err: - # seeking did not work + # Building a sparse map failed: + # - OSError: low-level lseek with SEEK_HOLE/SEEK_DATA not supported by FS/OS. + # - ValueError: high-level file objects (e.g. io.BytesIO or some fd wrappers) + # don't accept SEEK_HOLE/SEEK_DATA as a valid "whence" and raise ValueError. pass if fmap is None: diff --git a/src/borg/testsuite/chunkers/fixed_test.py b/src/borg/testsuite/chunkers/fixed_test.py index 8fac894f1..80782d4d0 100644 --- a/src/borg/testsuite/chunkers/fixed_test.py +++ b/src/borg/testsuite/chunkers/fixed_test.py @@ -34,7 +34,7 @@ from ...constants import * # NOQA ) def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): def get_chunks(fname, sparse, header_size): - chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse) + chunker = ChunkerFixed(BS, header_size=header_size, sparse=sparse) with open(fname, "rb") as fd: return cf(chunker.chunkify(fd)) From 95e12fb66915097d52982d031799d5c79691d901 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 26 Nov 2025 12:11:59 +0100 Subject: [PATCH 3/9] simplify fixed test --- src/borg/testsuite/chunkers/fixed_test.py | 35 ++--------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/src/borg/testsuite/chunkers/fixed_test.py b/src/borg/testsuite/chunkers/fixed_test.py index 80782d4d0..a06d6bdb1 100644 --- a/src/borg/testsuite/chunkers/fixed_test.py +++ b/src/borg/testsuite/chunkers/fixed_test.py @@ -38,42 +38,11 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): with open(fname, "rb") as fd: return cf(chunker.chunkify(fd)) + # this only works if sparse map blocks are same size as fixed chunker blocks fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) expected_content = make_content(sparse_map, header_size=header_size) - - # ChunkerFixed splits everything into fixed-size chunks (except maybe the header) - # We need to split the expected content similarly. - expected = [] - - # Handle header if present (it's the first item if header_size > 0) - if header_size > 0: - header = expected_content.pop(0) - expected.append(header) - - # Flatten the rest and split into 4096 chunks - current_chunk_size = 4096 - for item in expected_content: - if isinstance(item, int): - # Hole - count = item - while count > 0: - size = min(count, current_chunk_size) - expected.append(size) - count -= size - else: - # Data - data = item - while len(data) > 0: - size = min(len(data), current_chunk_size) - expected.append(data[:size]) - data = data[size:] - - if not sparse: - # if the chunker is not sparse-aware, it will read holes as zeros - expected = [b"\0" * x if isinstance(x, int) else x for x in expected] - - assert get_chunks(fn, sparse=sparse, header_size=header_size) == expected + assert get_chunks(fn, sparse=sparse, header_size=header_size) == expected_content @pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1") From 5a314d6f511888c2e8306e01ce802871ebdda8f3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 26 Nov 2025 12:20:32 +0100 Subject: [PATCH 4/9] don't use sparse with BytesIO doesn't work anyway. --- src/borg/testsuite/chunkers/interaction_test.py | 2 +- src/borg/testsuite/chunkers/reader_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/borg/testsuite/chunkers/interaction_test.py b/src/borg/testsuite/chunkers/interaction_test.py index 4417dc423..45c175299 100644 --- a/src/borg/testsuite/chunkers/interaction_test.py +++ b/src/borg/testsuite/chunkers/interaction_test.py @@ -29,7 +29,7 @@ def test_reader_chunker_interaction(chunker_params): random_data = os.urandom(data_size // 3) + b"\0" * (data_size // 3) + os.urandom(data_size // 3) # Chunk the data - chunker = get_chunker(*chunker_params, sparse=True) + chunker = get_chunker(*chunker_params) data_file = BytesIO(random_data) chunks = list(chunker.chunkify(data_file)) diff --git a/src/borg/testsuite/chunkers/reader_test.py b/src/borg/testsuite/chunkers/reader_test.py index f269d95cb..ff56350ee 100644 --- a/src/borg/testsuite/chunkers/reader_test.py +++ b/src/borg/testsuite/chunkers/reader_test.py @@ -170,7 +170,7 @@ def test_filereader_read_with_mock(mock_chunks, read_size, expected_data, expect ) def test_filefmapreader_basic(file_content, read_size, expected_chunks): """Test basic functionality of FileFMAPReader with different file contents.""" - reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=True, fmap=None) + reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=False, fmap=None) # Collect all chunks from blockify chunks = list(reader.blockify()) @@ -252,7 +252,7 @@ def test_filefmapreader_allocation_types(zeros_length, read_size, expected_alloc # Create a file with all zeros file_content = b"\0" * zeros_length - reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=True, fmap=None) + reader = FileFMAPReader(fd=BytesIO(file_content), fh=-1, read_size=read_size, sparse=False, fmap=None) # Collect all chunks from blockify chunks = list(reader.blockify()) From e9b62dc29a99b6fafa80300ff50dc803b1e1c148 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 26 Nov 2025 12:41:17 +0100 Subject: [PATCH 5/9] maps: always use BS sized blocks the fixed chunker also cuts BS blocks. --- src/borg/testsuite/chunkers/__init__.py | 32 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/borg/testsuite/chunkers/__init__.py b/src/borg/testsuite/chunkers/__init__.py index 2e01e98e3..ce9ff11e2 100644 --- a/src/borg/testsuite/chunkers/__init__.py +++ b/src/borg/testsuite/chunkers/__init__.py @@ -77,14 +77,38 @@ def fs_supports_sparse(): BS = 4096 # filesystem block size # Some sparse files. X = content blocks, _ = sparse blocks. +# Block size must always be BS. + # X__XXX____ -map_sparse1 = [(0 * BS, 1 * BS, True), (1 * BS, 2 * BS, False), (3 * BS, 3 * BS, True), (6 * BS, 4 * BS, False)] +map_sparse1 = [ + (0, BS, True), + (1 * BS, BS, False), + (2 * BS, BS, False), + (3 * BS, BS, True), + (4 * BS, BS, True), + (5 * BS, BS, True), + (6 * BS, BS, False), + (7 * BS, BS, False), + (8 * BS, BS, False), + (9 * BS, BS, False), +] # _XX___XXXX -map_sparse2 = [(0 * BS, 1 * BS, False), (1 * BS, 2 * BS, True), (3 * BS, 3 * BS, False), (6 * BS, 4 * BS, True)] +map_sparse2 = [ + (0, BS, False), + (1 * BS, BS, True), + (2 * BS, BS, True), + (3 * BS, BS, False), + (4 * BS, BS, False), + (5 * BS, BS, False), + (6 * BS, BS, True), + (7 * BS, BS, True), + (8 * BS, BS, True), + (9 * BS, BS, True), +] # XXX -map_notsparse = [(0 * BS, 3 * BS, True)] +map_notsparse = [(0, BS, True), (BS, BS, True), (2 * BS, BS, True)] # ___ -map_onlysparse = [(0 * BS, 3 * BS, False)] +map_onlysparse = [(0, BS, False), (BS, BS, False), (2 * BS, BS, False)] From 9b8cfc7d83472a0e877b0d5c7d550d8d13698ece Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 29 Nov 2025 13:04:48 +0100 Subject: [PATCH 6/9] tests: pretty print --- src/borg/testsuite/chunkers/fixed_test.py | 40 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/borg/testsuite/chunkers/fixed_test.py b/src/borg/testsuite/chunkers/fixed_test.py index a06d6bdb1..3bf78b809 100644 --- a/src/borg/testsuite/chunkers/fixed_test.py +++ b/src/borg/testsuite/chunkers/fixed_test.py @@ -4,13 +4,43 @@ import random import pytest -from . import cf, cf_expand, make_sparsefile, make_content, fs_supports_sparse +from . import cf, cf_expand, make_sparsefile, make_content from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse from ...chunkers import ChunkerFixed from ...constants import * # NOQA -@pytest.mark.skipif(not fs_supports_sparse(), reason="filesystem does not support sparse files") +def pretty_print(msg, items): + """ + Pretty-print the result of get_chunks. + + For each element in the sequence: + - If it's a bytes object consisting solely of b"H", print "header length: X" where X is its length. + - If it's a bytes object consisting solely of b"X", print "body length: X" where X is its length. + - If it's an int, print "sparse: length: X" where X is the integer value (interpreted as a length). + """ + print(msg) + print("-" * len(msg)) + for item in items: + if isinstance(item, bytes): + # Detect sequences of only 'H' (header) or only 'X' (body) + if item.replace(b"H", b"") == b"": + print(f"header({len(item)})") + elif item.replace(b"X", b"") == b"": + print(f"body({len(item)})") + elif item.replace(b"\0", b"") == b"": + print(f"zeros({len(item)})") + else: + # Fallback: unknown content, print as body with its length + print(f"other({len(item)})") + elif isinstance(item, int): + print(f"sparse({item})") + else: + # Unexpected element type, just print a generic line. + print(f"???({item})") + + +# @pytest.mark.skipif(not fs_supports_sparse(), reason="filesystem does not support sparse files") @pytest.mark.parametrize( "fname, sparse_map, header_size, sparse", [ @@ -42,7 +72,11 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) expected_content = make_content(sparse_map, header_size=header_size) - assert get_chunks(fn, sparse=sparse, header_size=header_size) == expected_content + got_chunks = get_chunks(fn, sparse=sparse, header_size=header_size) + print(f"sparse: {sparse}") + pretty_print("expected", expected_content) + pretty_print("got", got_chunks) + assert expected_content == got_chunks @pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1") From 827b82938f21402aefb45e57ee95a29d34b9be7b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 May 2026 23:58:54 +0200 Subject: [PATCH 7/9] FileReader.read: detect zero-filled slices from CH_DATA blocks When FileReader.read() sliced a large CH_DATA block (read at 1MB granularity) into smaller block_size chunks (e.g. 4096 bytes), zero-filled slices were returned as CH_DATA with zero bytes instead of CH_ALLOC. Add a zeros.startswith(result) check before returning a CH_DATA chunk, converting all-zero slices to CH_ALLOC. This ensures sparse-aware consumers correctly identify allocated-but-zero regions regardless of whether the file was read with sparse=True or sparse=False. --- src/borg/chunkers/reader.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/borg/chunkers/reader.pyx b/src/borg/chunkers/reader.pyx index 115281d42..7d9e35e8e 100644 --- a/src/borg/chunkers/reader.pyx +++ b/src/borg/chunkers/reader.pyx @@ -327,7 +327,12 @@ class FileReader: # Determine the allocation type of the resulting chunk if has_data: - # If any chunk was CH_DATA, the result is CH_DATA + # If any chunk was CH_DATA, check if the result is all zeros. + # This can happen when a large CH_DATA block (read at read_size granularity) + # contains both real data and zero-filled regions, and we are slicing out + # a zero-filled portion at the block_size granularity. + if zeros.startswith(result): + return Chunk(None, size=bytes_read, allocation=CH_ALLOC) return Chunk(bytes(result), size=bytes_read, allocation=CH_DATA) elif has_hole: # If any chunk was CH_HOLE (and none were CH_DATA), the result is CH_HOLE From 7b889d357dc16b1c66c33e000653fccf9a298a0c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 9 May 2026 00:32:45 +0200 Subject: [PATCH 8/9] test_sparsemap: coalesce expected sparse map to match OS behavior The test_sparsemap tests were failing on Linux CI because SEEK_HOLE/ SEEK_DATA naturally coalesces adjacent ranges of the same type (data or hole), but the tests compared against the raw per-block sparse maps which list each block separately. Add a coalesce_sparse_map() helper that merges adjacent ranges with the same is_data flag, and compare sparsemap() output against the coalesced expected map instead of the raw per-block map. --- src/borg/testsuite/chunkers/reader_test.py | 23 ++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/borg/testsuite/chunkers/reader_test.py b/src/borg/testsuite/chunkers/reader_test.py index ff56350ee..dfebc24b0 100644 --- a/src/borg/testsuite/chunkers/reader_test.py +++ b/src/borg/testsuite/chunkers/reader_test.py @@ -9,6 +9,22 @@ from ...chunkers import sparsemap, FileReader, FileFMAPReader, Chunk from ...constants import * # NOQA +def coalesce_sparse_map(sparse_map): + """Coalesce adjacent ranges with the same is_data flag, as the OS would report them.""" + if not sparse_map: + return [] + result = [] + start, size, is_data = sparse_map[0] + for next_start, next_size, next_is_data in sparse_map[1:]: + if next_is_data == is_data: + size += next_size + else: + result.append((start, size, is_data)) + start, size, is_data = next_start, next_size, next_is_data + result.append((start, size, is_data)) + return result + + @pytest.mark.skipif(not fs_supports_sparse(), reason="filesystem does not support sparse files") @pytest.mark.parametrize( "fname, sparse_map", @@ -28,8 +44,11 @@ def test_sparsemap(tmpdir, fname, sparse_map): fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map) - assert get_sparsemap_fh(fn) == sparse_map - assert get_sparsemap_fd(fn) == sparse_map + # The OS coalesces adjacent ranges of the same type (data or hole), + # so we compare against the coalesced version of the expected map. + expected = coalesce_sparse_map(sparse_map) + assert get_sparsemap_fh(fn) == expected + assert get_sparsemap_fd(fn) == expected @pytest.mark.parametrize( From 62d8762eb7cdb495f00d8473117b61857beaf5ca Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 9 May 2026 00:47:04 +0200 Subject: [PATCH 9/9] fixed_test: remove commented-out skipif decorator The @pytest.mark.skipif(not fs_supports_sparse(), ...) decorator on test_chunkify_sparse was commented out and is not needed because the zeros.startswith(result) fix in FileReader.read() detects zero-filled slices as CH_ALLOC regardless of sparse FS support, and ChunkerFixed with sparse=True gracefully falls back when SEEK_HOLE/SEEK_DATA is not available. --- src/borg/testsuite/chunkers/fixed_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/borg/testsuite/chunkers/fixed_test.py b/src/borg/testsuite/chunkers/fixed_test.py index 3bf78b809..8b8e628ef 100644 --- a/src/borg/testsuite/chunkers/fixed_test.py +++ b/src/borg/testsuite/chunkers/fixed_test.py @@ -40,7 +40,6 @@ def pretty_print(msg, items): print(f"???({item})") -# @pytest.mark.skipif(not fs_supports_sparse(), reason="filesystem does not support sparse files") @pytest.mark.parametrize( "fname, sparse_map, header_size, sparse", [