diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 5fac8fd3d..ff1136a60 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -462,7 +462,7 @@ can be used to tune the chunker parameters, the default is: - CHUNK_MIN_EXP = 19 (minimum chunk size = 2^19 B = 512 kiB) - CHUNK_MAX_EXP = 23 (maximum chunk size = 2^23 B = 8 MiB) - HASH_MASK_BITS = 21 (target chunk size ~= 2^21 B = 2 MiB) -- HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`) +- HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`) (must be an odd number) The buzhash table is altered by XORing it with a seed randomly generated once for the repository, and stored encrypted in the keyfile. This is to prevent diff --git a/docs/misc/create_chunker-params.txt b/docs/misc/create_chunker-params.txt index c091a2c14..f498ff03f 100644 --- a/docs/misc/create_chunker-params.txt +++ b/docs/misc/create_chunker-params.txt @@ -18,7 +18,7 @@ determined by the windows contents rather than the min/max. chunk size). Default: 21 (statistically, chunks will be about 2^21 == 2MiB in size) HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation. -Default: 4095B +Must be an odd number. Default: 4095B Trying it out @@ -114,4 +114,3 @@ $ ls -l /extra/repo-xl/index* $ du -sk /extra/repo-xl/ 14253464 /extra/repo-xl/ - diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 9accbc4fa..cb481685a 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -201,6 +201,8 @@ def ChunkerParams(s): raise argparse.ArgumentTypeError( "max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)" ) + if window_size % 2 == 0: + raise argparse.ArgumentTypeError("window_size must be an uneven (odd) number") return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size raise argparse.ArgumentTypeError("invalid chunker params") diff --git a/src/borg/testsuite/archiver/recreate_cmd_test.py b/src/borg/testsuite/archiver/recreate_cmd_test.py index 7c2dc879e..4fae8e27d 100644 --- a/src/borg/testsuite/archiver/recreate_cmd_test.py +++ b/src/borg/testsuite/archiver/recreate_cmd_test.py @@ -138,7 +138,7 @@ def test_recreate_rechunkify(archivers, request): fd.write(b"a" * 280) fd.write(b"b" * 280) cmd(archiver, "repo-create", RK_ENCRYPTION) - cmd(archiver, "create", "test1", "input", "--chunker-params", "7,9,8,128") + cmd(archiver, "create", "test1", "input", "--chunker-params", "7,9,8,127") cmd(archiver, "create", "test2", "input", "--files-cache=disabled") num_chunks1 = int(cmd(archiver, "list", "test1", "input/large_file", "--format", "{num_chunks}")) num_chunks2 = int(cmd(archiver, "list", "test2", "input/large_file", "--format", "{num_chunks}")) @@ -159,7 +159,7 @@ def test_recreate_fixed_rechunkify(archivers, request): with open(os.path.join(archiver.input_path, "file"), "wb") as fd: fd.write(b"a" * 8192) cmd(archiver, "repo-create", RK_ENCRYPTION) - cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,128") + cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,127") output = cmd(archiver, "list", "test", "input/file", "--format", "{num_chunks}") num_chunks = int(output) assert num_chunks > 2 @@ -175,7 +175,7 @@ def test_recreate_no_rechunkify(archivers, request): fd.write(b"a" * 8192) cmd(archiver, "repo-create", RK_ENCRYPTION) # first create an archive with non-default chunker params: - cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,128") + cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,127") output = cmd(archiver, "list", "test", "input/file", "--format", "{num_chunks}") num_chunks = int(output) # now recreate the archive and do NOT specify chunker params: diff --git a/src/borg/testsuite/helpers/parseformat_test.py b/src/borg/testsuite/helpers/parseformat_test.py index d7cd002d0..ef39e6714 100644 --- a/src/borg/testsuite/helpers/parseformat_test.py +++ b/src/borg/testsuite/helpers/parseformat_test.py @@ -596,6 +596,7 @@ def test_valid_chunkerparams(chunker_params, expected_return): "buzhash,5,7,6,4095", # too small min. size "buzhash,19,24,21,4095", # too big max. size "buzhash,23,19,21,4095", # violates min <= mask <= max + "buzhash,19,23,21,4096", # even window size "fixed,63", # too small block size "fixed,%d,%d" % (MAX_DATA_SIZE + 1, 4096), # too big block size "fixed,%d,%d" % (4096, MAX_DATA_SIZE + 1), # too big header size