buzhash64: init table using a 256-bit key derived from the ID key

That way we can feed much more entropy into the table creation than the previous 64-bit seed allowed. The bh64_key is derived from the id_key (NOT the crypt_key), so related repositories get the same bh64_key even if they use different encryption/authentication keys. Consequently, they also build the same buzhash64 table and cut chunks at the same points, so deduplication works among the related repositories.
2026-05-28 04:03:21 -04:00 · 2025-06-06 22:39:25 +02:00 · 2025-06-06 22:39:25 +02:00 · b9646f236e
commit b9646f236e
parent 544b3f41a9
5 changed files with 46 additions and 50 deletions
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
-from ..crypto.key import PlaintextKey

 API_VERSION = "1.2_01"

@@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw):
    sparse = kw.get("sparse", False)
    # key.chunk_seed only has 32bits
    seed = key.chunk_seed if key is not None else 0
-    # we want 64bits for buzhash64, get them from crypt_key
-    if key is None or isinstance(key, PlaintextKey):
-        seed64 = 0
-    else:
-        seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little")
+    # for buzhash64, we want a much longer key, so we derive it from the id key
+    bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
    if algo == "buzhash":
        return Chunker(seed, *params, sparse=sparse)
    if algo == "buzhash64":
-        return ChunkerBuzHash64(seed64, *params, sparse=sparse)
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
    if algo == "fixed":
        return ChunkerFixed(*params, sparse=sparse)
    if algo == "fail":
--- a/src/borg/chunkers/buzhash64.pyi
+++ b/src/borg/chunkers/buzhash64.pyi
@@ -4,13 +4,13 @@ from .reader import fmap_entry

 API_VERSION: str

-def buzhash64(data: bytes, seed: int) -> int: ...
-def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
+def buzhash64(data: bytes, key: bytes) -> int: ...
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...

 class ChunkerBuzHash64:
    def __init__(
        self,
-        seed: int,
+        key: bytes,
        chunk_min_exp: int,
        chunk_max_exp: int,
        hash_mask_bits: int,
--- a/src/borg/chunkers/buzhash64.pyx
+++ b/src/borg/chunkers/buzhash64.pyx
@@ -39,13 +39,13 @@ cdef extern from *:

@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint64_t* buzhash64_init_table(uint64_t seed):
-    """Initialize the buzhash table with the given seed."""
+cdef uint64_t* buzhash64_init_table(bytes key):
+    """Initialize the buzhash table using the given key."""
    cdef int i
    cdef uint64_t* table = <uint64_t*>malloc(2048)  # 256 * sizeof(uint64_t)
    for i in range(256):
-        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed:
-        v = f"{i:02x}{seed:016x}".encode()
+        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+        v = f"{i:02x}".encode() + key
        d64 = sha256(v).digest()[:8]
        table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
    return table
@@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64:
    cdef size_t reader_block_size
    cdef bint sparse

-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
        min_size = 1 << chunk_min_exp
        max_size = 1 << chunk_max_exp
        assert max_size <= len(zeros)
@@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64:
        self.window_size = hash_window_size
        self.chunk_mask = (1 << hash_mask_bits) - 1
        self.min_size = min_size
-        self.table = buzhash64_init_table(seed & 0xffffffffffffffff)
+        self.table = buzhash64_init_table(key)
        self.buf_size = max_size
        self.data = <uint8_t*>malloc(self.buf_size)
        self.fh = -1
@@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64:
        return Chunk(data, size=got, allocation=allocation)


-def buzhash64(data, unsigned long seed):
+def buzhash64(data, bytes key):
    cdef uint64_t *table
    cdef uint64_t sum
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
    sum = _buzhash64(<const unsigned char *> data, len(data), table)
    free(table)
    return sum


-def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
    cdef uint64_t *table
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
    sum = _buzhash64_update(sum, remove, add, len, table)
    free(table)
    return sum
--- a/src/borg/testsuite/chunkers/buzhash64_self_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py
@@ -13,56 +13,56 @@ from . import cf
 class ChunkerBuzHash64TestCase(BaseTestCase):
    def test_chunkify64(self):
        data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
-        parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
        self.assert_equal(len(parts), 2)
        self.assert_equal(b"".join(parts), data)
-        self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
        self.assert_equal(
-            cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"],
+            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoobarboobazfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
-        )
-        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
            [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
        )
        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
            [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )

    def test_buzhash64(self):
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849)
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719)
-        expected = buzhash64(b"abcdefghijklmnop", 1)
-        previous = buzhash64(b"Xabcdefghijklmno", 1)
-        this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
+        expected = buzhash64(b"abcdefghijklmnop", b"1")
+        previous = buzhash64(b"Xabcdefghijklmno", b"1")
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
        self.assert_equal(this, expected)
        # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
-        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313)
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)

    def test_small_reads64(self):
        class SmallReadFile:
--- a/src/borg/testsuite/chunkers/buzhash64_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_test.py
@@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged():
                if minexp >= maxexp:
                    continue
                for maskbits in (4, 7, 10, 12):
-                    for seed in (1849058162, 1234567653):
+                    for key in (b"first_key", b"second_key"):
                        fh = BytesIO(data)
-                        chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize)
+                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
                        runs.append(H(b"".join(chunks)))

    # The "correct" hash below matches the existing chunker behavior.
    # Future chunker optimisations must not change this, or existing repos will bloat.
    overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7")
+    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")


 def test_buzhash64_chunksize_distribution():
    data = os.urandom(1048576)
    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
-    chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095)
+    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
    f = BytesIO(data)
    chunks = cf(chunker.chunkify(f))
    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp