@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key):
    """
    Generate a balanced pseudo-random table deterministically from the given key.

    Balanced means that for each bit position 0..63, exactly 50% of the table
    values (128 of the 256 entries) have the bit set to 1.

    The key is used only as the seed of the random number generator, so the same
    key always yields the same table within one Python version.

    Returns a pointer to a malloc()ed array of 256 uint64_t values. The caller
    owns the array and must free() it.

    Raises MemoryError if the table cannot be allocated.
    """
    cdef int i, j, bit_pos
    cdef uint64_t* table = <uint64_t*>malloc(256 * sizeof(uint64_t))
    if table == NULL:
        raise MemoryError("could not allocate the buzhash64 table")

    try:
        # Create deterministic random number generator.
        # NOTE(review): the Python documentation only guarantees cross-version
        # reproducibility for Random.random(), not for shuffle(). Confirm that a
        # table which might differ between Python versions is acceptable here.
        rng = random.Random(int.from_bytes(key, 'big'))

        # Initialize all values to 0, the bits get OR-ed in below.
        for i in range(256):
            table[i] = 0

        # For each bit position, deterministically assign exactly 128 positions to have that bit set.
        for bit_pos in range(64):
            # Always start from a freshly sorted index list, so that the result of
            # the shuffle depends only on the RNG state and not on the previous round.
            indices = list(range(256))
            rng.shuffle(indices)

            # The shuffled indices are distinct, so the first 128 of them select
            # exactly half of the table entries.
            for i in range(128):
                j = indices[i]
                table[j] |= (1ULL << bit_pos)
    except BaseException:
        # Do not leak the table if any of the Python-level calls above fails.
        free(table)
        raise

    return table


def buzhash64_get_table(bytes key):
    """Get the buzhash table generated from <key> as a list of 256 Python ints."""
    cdef uint64_t *table
    cdef int i
    table = buzhash64_init_table(key)
    try:
        # Copy the C values into Python ints before the C array is released.
        return [table[i] for i in range(256)]
    finally:
        free(table)
# from os.urandom(32)
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
key2 = hex_to_bin("57174a65fde67fe127b18430525b50a58406f1bd6cc629535208c7832e181067")


class ChunkerBuzHash64TestCase(BaseTestCase):
    def test_chunkify64(self):
        """Pin the chunk boundaries ChunkerBuzHash64 produces for fixed keys and parameters."""

        def chunkify(ctor_args, payload):
            # Build a fresh chunker for every call, like a separate backup run would.
            return cf(ChunkerBuzHash64(*ctor_args).chunkify(BytesIO(payload)))

        base_args = (key0, 1, CHUNK_MAX_EXP, 2, 2)

        # Input larger than the maximum chunk size must be split, without losing data.
        big_input = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
        pieces = chunkify(base_args, big_input)
        self.assert_equal(len(pieces), 2)
        self.assert_equal(b"".join(pieces), big_input)

        # Empty input yields no chunks at all.
        self.assert_equal(chunkify(base_args, b""), [])

        # (constructor arguments, expected chunks) for the same repetitive input.
        text = b"foobarboobaz" * 3
        expectations = [
            ((key0, 1, CHUNK_MAX_EXP, 2, 2), [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"]),
            (
                (key1, 1, CHUNK_MAX_EXP, 2, 2),
                [b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"],
            ),
            (
                (key2, 1, CHUNK_MAX_EXP, 2, 2),
                [b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"],
            ),
            (
                (key0, 2, CHUNK_MAX_EXP, 2, 3),
                [b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"],
            ),
            (
                (key1, 2, CHUNK_MAX_EXP, 2, 3),
                [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
            ),
            ((key2, 2, CHUNK_MAX_EXP, 2, 3), [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"]),
            ((key0, 3, CHUNK_MAX_EXP, 2, 3), [b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"]),
            ((key1, 3, CHUNK_MAX_EXP, 2, 3), [b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"]),
            ((key2, 3, CHUNK_MAX_EXP, 2, 3), [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"]),
        ]
        for ctor_args, expected_chunks in expectations:
            self.assert_equal(chunkify(ctor_args, text), expected_chunks)

    def test_buzhash64(self):
        """Pin buzhash64 values and check that the rolling update matches a full rehash."""
        window = b"abcdefghijklmnop"
        self.assert_equal(buzhash64(window, key0), 15080163834872228739)
        self.assert_equal(buzhash64(window, key1), 9505908538285923444)

        # Rolling "X" out and "p" in must give the same hash as hashing the new window directly.
        full_hash = buzhash64(window, key0)
        rolled_hash = buzhash64_update(buzhash64(b"Xabcdefghijklmno", key0), ord("X"), ord("p"), 16, key0)
        self.assert_equal(rolled_hash, full_hash)

        # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
        long_input = b"abcdefghijklmnopqrstuvwxyz" * 4
        self.assert_equal(buzhash64(long_input, key0), 1936382207158378368)
def test_buzhash64_table():
    """Check size, value range, determinism, key dependence and bit balance of the table."""
    table0 = buzhash64_get_table(key0)
    table1 = buzhash64_get_table(key1)

    # The structural invariants must hold for every key, not just for one of them.
    for table in (table0, table1):
        # Test that the function returns a list of 256 integers
        assert len(table) == 256

        # Test that all elements are integers that fit into an unsigned 64-bit value
        for value in table:
            assert isinstance(value, int)
            assert 0 <= value < (1 << 64)

        # Test that the table has balanced bit distribution:
        # for each bit position 0..63, exactly 50% of the table values have the bit set to 1
        for bit_pos in range(64):
            bit_count = sum(1 for value in table if value & (1 << bit_pos))
            assert bit_count == 128  # 50% of 256 = 128

    # Test that the function is deterministic (same key produces same table)
    assert buzhash64_get_table(key0) == table0

    # Test that different keys produce different tables
    assert table0 != table1