From b9646f236ebd1a16cb49a8e8adaf550eea2391ac Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 6 Jun 2025 22:39:25 +0200
Subject: [PATCH] buzhash64: init table using a 256bit key derived from ID key

That way we can feed lots of entropy into the table creation.

The bh64_key is derived from the id_key (NOT the crypt_key), so
related repositories get the same bh64_key (even if they use
different encryption/authentication keys). As a result, they also
build the same buzhash64 table and cut chunks at the same points,
so deduplication works across the related repositories.
---
 src/borg/chunkers/__init__.py                 | 10 ++--
 src/borg/chunkers/buzhash64.pyi               |  6 +--
 src/borg/chunkers/buzhash64.pyx               | 20 +++----
 .../testsuite/chunkers/buzhash64_self_test.py | 52 +++++++++----------
 src/borg/testsuite/chunkers/buzhash64_test.py |  8 +--
 5 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py
index c3c625760..463be44c8 100644
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
-from ..crypto.key import PlaintextKey
 
 API_VERSION = "1.2_01"
 
@@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw):
     sparse = kw.get("sparse", False)
     # key.chunk_seed only has 32bits
     seed = key.chunk_seed if key is not None else 0
-    # we want 64bits for buzhash64, get them from crypt_key
-    if key is None or isinstance(key, PlaintextKey):
-        seed64 = 0
-    else:
-        seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little")
+    # for buzhash64, we want a much longer key, so we derive it from the id key
+    bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
-        return ChunkerBuzHash64(seed64, *params, sparse=sparse)
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":
diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi
index 4ca5359d3..3414bd609 100644
--- a/src/borg/chunkers/buzhash64.pyi
+++ b/src/borg/chunkers/buzhash64.pyi
@@ -4,13 +4,13 @@ from .reader import fmap_entry
 
 API_VERSION: str
 
-def buzhash64(data: bytes, seed: int) -> int: ...
-def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
+def buzhash64(data: bytes, key: bytes) -> int: ...
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
 
 class ChunkerBuzHash64:
     def __init__(
         self,
-        seed: int,
+        key: bytes,
         chunk_min_exp: int,
         chunk_max_exp: int,
         hash_mask_bits: int,
diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx
index db264d74a..0199406fe 100644
--- a/src/borg/chunkers/buzhash64.pyx
+++ b/src/borg/chunkers/buzhash64.pyx
@@ -39,13 +39,13 @@ cdef extern from *:
 
 @cython.boundscheck(False)  # Deactivate bounds checking
 @cython.wraparound(False)  # Deactivate negative indexing.
-cdef uint64_t* buzhash64_init_table(uint64_t seed):
-    """Initialize the buzhash table with the given seed."""
+cdef uint64_t* buzhash64_init_table(bytes key):
+    """Initialize the buzhash table using the given key."""
     cdef int i
     cdef uint64_t* table = <uint64_t*>malloc(2048)  # 256 * sizeof(uint64_t)
     for i in range(256):
-        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed:
-        v = f"{i:02x}{seed:016x}".encode()
+        # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+        v = f"{i:02x}".encode() + key
         d64 = sha256(v).digest()[:8]
         table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
     return table
@@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64:
     cdef size_t reader_block_size
     cdef bint sparse
 
-    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
         assert max_size <= len(zeros)
@@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64:
         self.window_size = hash_window_size
         self.chunk_mask = (1 << hash_mask_bits) - 1
         self.min_size = min_size
-        self.table = buzhash64_init_table(seed & 0xffffffffffffffff)
+        self.table = buzhash64_init_table(key)
         self.buf_size = max_size
         self.data = <uint8_t*>malloc(self.buf_size)
         self.fh = -1
@@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64:
         return Chunk(data, size=got, allocation=allocation)
 
 
-def buzhash64(data, unsigned long seed):
+def buzhash64(data, bytes key):
     cdef uint64_t *table
     cdef uint64_t sum
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64(<const unsigned char *> data, len(data), table)
     free(table)
     return sum
 
 
-def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
     cdef uint64_t *table
-    table = buzhash64_init_table(seed & 0xffffffffffffffff)
+    table = buzhash64_init_table(key)
     sum = _buzhash64_update(sum, remove, add, len, table)
     free(table)
     return sum
diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py
index a356afbbf..41198477d 100644
--- a/src/borg/testsuite/chunkers/buzhash64_self_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py
@@ -13,56 +13,56 @@ from . import cf
 class ChunkerBuzHash64TestCase(BaseTestCase):
     def test_chunkify64(self):
         data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
-        parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b"".join(parts), data)
-        self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"],
+            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoobarboobazfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
-        )
-        self.assert_equal(
-            cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"],
+            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
-            [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
+            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
         )
         self.assert_equal(
-            cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
             [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
         )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )
 
     def test_buzhash64(self):
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849)
-        self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719)
-        expected = buzhash64(b"abcdefghijklmnop", 1)
-        previous = buzhash64(b"Xabcdefghijklmno", 1)
-        this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
+        expected = buzhash64(b"abcdefghijklmnop", b"1")
+        previous = buzhash64(b"Xabcdefghijklmno", b"1")
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
         self.assert_equal(this, expected)
         # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
-        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313)
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
 
     def test_small_reads64(self):
         class SmallReadFile:
diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py
index fef302838..7a0019732 100644
--- a/src/borg/testsuite/chunkers/buzhash64_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_test.py
@@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged():
                 if minexp >= maxexp:
                     continue
                 for maskbits in (4, 7, 10, 12):
-                    for seed in (1849058162, 1234567653):
+                    for key in (b"first_key", b"second_key"):
                         fh = BytesIO(data)
-                        chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize)
+                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
                         chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
                         runs.append(H(b"".join(chunks)))
 
     # The "correct" hash below matches the existing chunker behavior.
     # Future chunker optimisations must not change this, or existing repos will bloat.
     overall_hash = H(b"".join(runs))
-    assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7")
+    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
 
 
 def test_buzhash64_chunksize_distribution():
     data = os.urandom(1048576)
     min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
-    chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095)
+    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
     f = BytesIO(data)
     chunks = cf(chunker.chunkify(f))
     del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp