Merge pull request #8872 from ThomasWaldmann/chunker-params-reject-even-window-size

ChunkerParams: reject even window size for buzhash, fixes #8868
This commit is contained in:
TW 2025-05-23 07:36:23 +02:00 committed by GitHub
commit 2b655fccf7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 8 additions and 6 deletions

View file

@@ -462,7 +462,7 @@ can be used to tune the chunker parameters, the default is:
 - CHUNK_MIN_EXP = 19 (minimum chunk size = 2^19 B = 512 kiB)
 - CHUNK_MAX_EXP = 23 (maximum chunk size = 2^23 B = 8 MiB)
 - HASH_MASK_BITS = 21 (target chunk size ~= 2^21 B = 2 MiB)
-- HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`)
+- HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`) (must be an odd number)
 The buzhash table is altered by XORing it with a seed randomly generated once
 for the repository, and stored encrypted in the keyfile. This is to prevent

View file

@@ -18,7 +18,7 @@ determined by the windows contents rather than the min/max. chunk size).
 Default: 21 (statistically, chunks will be about 2^21 == 2MiB in size)
 HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
-Default: 4095B
+Must be an odd number. Default: 4095B
 Trying it out
@ -114,4 +114,3 @@ $ ls -l /extra/repo-xl/index*
$ du -sk /extra/repo-xl/
14253464 /extra/repo-xl/

View file

@ -201,6 +201,8 @@ def ChunkerParams(s):
raise argparse.ArgumentTypeError(
"max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)"
)
if window_size % 2 == 0:
raise argparse.ArgumentTypeError("window_size must be an uneven (odd) number")
return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size
raise argparse.ArgumentTypeError("invalid chunker params")

View file

@ -138,7 +138,7 @@ def test_recreate_rechunkify(archivers, request):
fd.write(b"a" * 280)
fd.write(b"b" * 280)
cmd(archiver, "repo-create", RK_ENCRYPTION)
cmd(archiver, "create", "test1", "input", "--chunker-params", "7,9,8,128")
cmd(archiver, "create", "test1", "input", "--chunker-params", "7,9,8,127")
cmd(archiver, "create", "test2", "input", "--files-cache=disabled")
num_chunks1 = int(cmd(archiver, "list", "test1", "input/large_file", "--format", "{num_chunks}"))
num_chunks2 = int(cmd(archiver, "list", "test2", "input/large_file", "--format", "{num_chunks}"))
@ -159,7 +159,7 @@ def test_recreate_fixed_rechunkify(archivers, request):
with open(os.path.join(archiver.input_path, "file"), "wb") as fd:
fd.write(b"a" * 8192)
cmd(archiver, "repo-create", RK_ENCRYPTION)
cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,128")
cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,127")
output = cmd(archiver, "list", "test", "input/file", "--format", "{num_chunks}")
num_chunks = int(output)
assert num_chunks > 2
@ -175,7 +175,7 @@ def test_recreate_no_rechunkify(archivers, request):
fd.write(b"a" * 8192)
cmd(archiver, "repo-create", RK_ENCRYPTION)
# first create an archive with non-default chunker params:
cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,128")
cmd(archiver, "create", "test", "input", "--chunker-params", "7,9,8,127")
output = cmd(archiver, "list", "test", "input/file", "--format", "{num_chunks}")
num_chunks = int(output)
# now recreate the archive and do NOT specify chunker params:

View file

@ -596,6 +596,7 @@ def test_valid_chunkerparams(chunker_params, expected_return):
"buzhash,5,7,6,4095", # too small min. size
"buzhash,19,24,21,4095", # too big max. size
"buzhash,23,19,21,4095", # violates min <= mask <= max
"buzhash,19,23,21,4096", # even window size
"fixed,63", # too small block size
"fixed,%d,%d" % (MAX_DATA_SIZE + 1, 4096), # too big block size
"fixed,%d,%d" % (4096, MAX_DATA_SIZE + 1), # too big header size