From b9646f236ebd1a16cb49a8e8adaf550eea2391ac Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 Jun 2025 22:39:25 +0200 Subject: [PATCH] buzhash64: init table using a 256bit key derived from ID key That way we can feed lots of entropy into the table creation. The bh64_key is derived from the id_key (NOT the crypt_key), thus it will create the same key for related repositories (even if they use different encryption/authentication keys). Due to that, it will also create the same buzhash64 table, will cut chunks at the same points and deduplication will work amongst the related repositories. --- src/borg/chunkers/__init__.py | 10 ++-- src/borg/chunkers/buzhash64.pyi | 6 +-- src/borg/chunkers/buzhash64.pyx | 20 +++---- .../testsuite/chunkers/buzhash64_self_test.py | 52 +++++++++---------- src/borg/testsuite/chunkers/buzhash64_test.py | 8 +-- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index c3c625760..463be44c8 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64 from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa -from ..crypto.key import PlaintextKey API_VERSION = "1.2_01" @@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw): sparse = kw.get("sparse", False) # key.chunk_seed only has 32bits seed = key.chunk_seed if key is not None else 0 - # we want 64bits for buzhash64, get them from crypt_key - if key is None or isinstance(key, PlaintextKey): - seed64 = 0 - else: - seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little") + # for buzhash64, we want a much longer key, so we derive it from the id key + bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"" if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - return ChunkerBuzHash64(seed64, *params, sparse=sparse) + return ChunkerBuzHash64(bh64_key, *params, sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi index 4ca5359d3..3414bd609 100644 --- a/src/borg/chunkers/buzhash64.pyi +++ b/src/borg/chunkers/buzhash64.pyi @@ -4,13 +4,13 @@ from .reader import fmap_entry API_VERSION: str -def buzhash64(data: bytes, seed: int) -> int: ... -def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ... +def buzhash64(data: bytes, key: bytes) -> int: ... +def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ... class ChunkerBuzHash64: def __init__( self, - seed: int, + key: bytes, chunk_min_exp: int, chunk_max_exp: int, hash_mask_bits: int, diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index db264d74a..0199406fe 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -39,13 +39,13 @@ cdef extern from *: @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. -cdef uint64_t* buzhash64_init_table(uint64_t seed): - """Initialize the buzhash table with the given seed.""" +cdef uint64_t* buzhash64_init_table(bytes key): + """Initialize the buzhash table using the given key.""" cdef int i cdef uint64_t* table = malloc(2048) # 256 * sizeof(uint64_t) for i in range(256): - # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed: - v = f"{i:02x}{seed:016x}".encode() + # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key: + v = f"{i:02x}".encode() + key d64 = sha256(v).digest()[:8] table[i] = int.from_bytes(d64, byteorder='little') return table @@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64: cdef size_t reader_block_size cdef bint sparse - def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp assert max_size <= len(zeros) @@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64: self.window_size = hash_window_size self.chunk_mask = (1 << hash_mask_bits) - 1 self.min_size = min_size - self.table = buzhash64_init_table(seed & 0xffffffffffffffff) + self.table = buzhash64_init_table(key) self.buf_size = max_size self.data = malloc(self.buf_size) self.fh = -1 @@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64: return Chunk(data, size=got, allocation=allocation) -def buzhash64(data, unsigned long seed): +def buzhash64(data, bytes key): cdef uint64_t *table cdef uint64_t sum - table = buzhash64_init_table(seed & 0xffffffffffffffff) + table = buzhash64_init_table(key) sum = _buzhash64( data, len(data), table) free(table) return sum -def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): +def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key): cdef uint64_t *table - table = buzhash64_init_table(seed & 0xffffffffffffffff) + table = buzhash64_init_table(key) sum = _buzhash64_update(sum, remove, add, len, table) free(table) return sum diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index a356afbbf..41198477d 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -13,56 +13,56 @@ from . import cf class ChunkerBuzHash64TestCase(BaseTestCase): def test_chunkify64(self): data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y" - parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) + parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) self.assert_equal(len(parts), 2) self.assert_equal(b"".join(parts), data) - self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) + self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) self.assert_equal( - cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"], + cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobazfoobarboobazfoobarboobaz"], + cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"], + cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"], + cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], - ) - self.assert_equal( - cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"], + cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], + cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"], ) + self.assert_equal( + cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + ) def test_buzhash64(self): - self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849) - self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719) - expected = buzhash64(b"abcdefghijklmnop", 1) - previous = buzhash64(b"Xabcdefghijklmno", 1) - this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1) + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478) + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910) + expected = buzhash64(b"abcdefghijklmnop", b"1") + previous = buzhash64(b"Xabcdefghijklmno", b"1") + this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1") self.assert_equal(this, expected) # Test with more than 63 bytes to make sure our barrel_shift macro works correctly - self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313) + self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899) def test_small_reads64(self): class SmallReadFile: diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index fef302838..7a0019732 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged(): if minexp >= maxexp: continue for maskbits in (4, 7, 10, 12): - for seed in (1849058162, 1234567653): + for key in (b"first_key", b"second_key"): fh = BytesIO(data) - chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize) + chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize) chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] runs.append(H(b"".join(chunks))) # The "correct" hash below matches the existing chunker behavior. # Future chunker optimisations must not change this, or existing repos will bloat. overall_hash = H(b"".join(runs)) - assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7") + assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc") def test_buzhash64_chunksize_distribution(): data = os.urandom(1048576) min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB - chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095) + chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095) f = BytesIO(data) chunks = cf(chunker.chunkify(f)) del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp