add fuzzing tests for chunkers

Thomas Waldmann 2025-10-15 23:48:13 +02:00
parent f401ca00f9
commit 084750ad48
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
4 changed files with 128 additions and 3 deletions

View file

@@ -22,6 +22,11 @@ def cf(chunks):
    return [_cf(chunk) for chunk in chunks]


def cf_expand(chunks):
    """same as cf, but returns all-zero bytestrings instead of ints for HOLE and ALLOC"""
    return [ch if isinstance(ch, bytes) else b"\0" * ch for ch in cf(chunks)]


def make_sparsefile(fname, sparsemap, header_size=0):
    with open(fname, "wb") as fd:
        total = 0
View file

@@ -1,8 +1,11 @@
from hashlib import sha256
from io import BytesIO
import os
import random
from . import cf
import pytest
from . import cf, cf_expand
from ...chunkers import ChunkerBuzHash64
from ...chunkers.buzhash64 import buzhash64_get_table
from ...constants import * # NOQA
@@ -98,3 +101,42 @@ def test_buzhash64_table():
    for bit_pos in range(64):
        bit_count = sum(1 for value in table0 if value & (1 << bit_pos))
        assert bit_count == 128  # 50% of 256 = 128


@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
def test_fuzz_bh64(worker):
    # Fuzz buzhash64 with random and uniform data of misc. sizes and misc. keys.
    def rnd_key():
        return os.urandom(32)

    # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size)
    algo, min_exp, max_exp, mask_bits, win_size = CHUNKER64_PARAMS
    assert algo == CH_BUZHASH64  # default chunker must be buzhash64 here
    keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]
    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
    for key in keys:
        chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size)
        for size in sizes:
            # Random data
            data = os.urandom(size)
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-same data (non-zero)
            data = b"\x42" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-zero data
            data = b"\x00" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
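
The three payload variants in this test (random, all-same, all-zero) repeat the same chunkify-and-reassemble check. As a side note, a small helper could factor that round trip out; the sketch below is illustrative only — the helper name is hypothetical and it relies on BytesIO and cf_expand as imported in this test module, not on anything added by the commit.

# Hypothetical helper: chunkify the data, expand HOLE/ALLOC placeholders to zero
# bytes, and check that joining the parts reproduces the input exactly.
def assert_roundtrip(chunker, data):
    with BytesIO(data) as bio:
        parts = cf_expand(chunker.chunkify(bio))
    assert b"".join(parts) == data

# the per-size body would then shrink to:
# for payload in (os.urandom(size), b"\x42" * size, b"\x00" * size):
#     assert_roundtrip(chunker, payload)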

View file

@@ -1,8 +1,11 @@
from hashlib import sha256
from io import BytesIO
import os
import random
from . import cf
import pytest
from . import cf, cf_expand
from ...chunkers import Chunker
from ...constants import * # NOQA
from ...helpers import hex_to_bin
@@ -67,3 +70,43 @@ def test_buzhash_chunksize_distribution():
    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
    assert min_count < 10
    assert max_count < 10


@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
def test_fuzz_buzhash(worker):
    # Fuzz the default chunker (buzhash) with random and uniform data of misc. sizes and seeds 0 or random int32 values.
    def rnd_int32():
        uint = random.getrandbits(32)
        return uint if uint < 2**31 else uint - 2**32

    # decompose CHUNKER_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size)
    algo, min_exp, max_exp, mask_bits, win_size = CHUNKER_PARAMS
    assert algo == CH_BUZHASH  # default chunker must be buzhash here
    seeds = [0] + [rnd_int32() for _ in range(50)]
    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
    for seed in seeds:
        chunker = Chunker(seed, min_exp, max_exp, mask_bits, win_size)
        for size in sizes:
            # Random data
            data = os.urandom(size)
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-same data (non-zero)
            data = b"\x42" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-zero data
            data = b"\x00" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
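
A brief note on the CHUNKER_PARAMS decomposition used above: min_exp and max_exp are power-of-two exponents bounding the chunk size, and mask_bits steers the statistically expected chunk size. A small illustrative sketch follows; the values 19/23/21 are borg's documented default chunker parameters, used here only as an example rather than read from the constants module.

# Sketch: chunk size bounds implied by (min_exp, max_exp, mask_bits).
min_exp, max_exp, mask_bits = 19, 23, 21  # example values (borg's documented defaults)
min_size = 2**min_exp    # 524288  -> no chunk smaller than 512 KiB (except the last one)
max_size = 2**max_exp    # 8388608 -> hard 8 MiB upper bound per chunk
avg_size = 2**mask_bits  # 2097152 -> ~2 MiB statistically expected chunk size
assert min_size <= avg_size <= max_size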

View file

@@ -1,6 +1,10 @@
from io import BytesIO
import os
import random
import pytest
from . import cf, make_sparsefile, make_content, fs_supports_sparse
from . import cf, cf_expand, make_sparsefile, make_content, fs_supports_sparse
from . import BS, map_sparse1, map_sparse2, map_onlysparse, map_notsparse
from ...chunkers import ChunkerFixed
from ...constants import * # NOQA
@@ -37,3 +41,34 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map, header_size=header_size)
    assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)


@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
def test_fuzz_fixed(worker):
    # Fuzz fixed chunker with random and uniform data of misc. sizes.
    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
    for block_size, header_size in [(1024, 64), (1234, 0), (4321, 123)]:
        chunker = ChunkerFixed(block_size, header_size)
        for size in sizes:
            # Random data
            data = os.urandom(size)
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-same data (non-zero)
            data = b"\x42" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
            # All-zero data
            data = b"\x00" * size
            with BytesIO(data) as bio:
                parts = cf_expand(chunker.chunkify(bio))
            reconstructed = b"".join(parts)
            assert reconstructed == data
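
Unlike the rolling-hash chunkers, the fixed chunker's cut points are fully deterministic, so the expected chunk layout can be spelled out directly. The sketch below encodes the assumed layout — an optional header_size chunk first, then block_size chunks, with the last one possibly shorter; it is an illustration of that assumption, not code from the commit.

# Sketch of the assumed fixed-chunking layout: optional header chunk, then
# fixed-size blocks, with a shorter final block if the data does not divide evenly.
def expected_sizes(size, block_size, header_size=0):
    sizes = []
    if header_size and size:
        sizes.append(min(header_size, size))
        size -= sizes[-1]
    while size > 0:
        sizes.append(min(block_size, size))
        size -= sizes[-1]
    return sizes

assert expected_sizes(10_000, 1024, 64) == [64] + [1024] * 9 + [720]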