buzhash64: deterministically create a balanced bh table

The previous approach had cryptographic-strength randomness, but did not guarantee a precise
50:50 distribution of 0 and 1 bits at each bit position in the table.

Now this balance is always guaranteed by the way the table is constructed.
This commit is contained in:
Thomas Waldmann 2025-06-15 11:03:28 +02:00
parent d04f41b886
commit d48c9643e8
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
4 changed files with 101 additions and 35 deletions

View file

@ -6,6 +6,7 @@ API_VERSION: str
# Compute the 64-bit buzhash of <data>, keyed by <key>.
def buzhash64(data: bytes, key: bytes) -> int: ...
# Roll an existing hash <sum> forward by one byte over a window of <len> bytes:
# byte value <remove> leaves the window, byte value <add> enters it.
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
# Get the buzhash table generated from <key>, as a list of 256 ints.
def buzhash64_get_table(key: bytes) -> List[int]: ...
class ChunkerBuzHash64:
def __init__(

View file

@ -3,8 +3,8 @@
API_VERSION = '1.2_01'
import cython
import random
import time
from hashlib import sha256
from cpython.bytes cimport PyBytes_AsString
from libc.stdint cimport uint8_t, uint64_t
@ -40,14 +40,31 @@ cdef extern from *:
@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key):
"""Initialize the buzhash table using the given key."""
cdef int i
"""
Generate a balanced pseudo-random table deterministically from a 256-bit key.
Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1.
"""
# Create deterministic random number generator
rng = random.Random(int.from_bytes(key, 'big'))
cdef int i, j, bit_pos
cdef uint64_t* table = <uint64_t*>malloc(2048) # 256 * sizeof(uint64_t)
# Initialize all values to 0
for i in range(256):
# deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
v = f"{i:02x}".encode() + key
d64 = sha256(v).digest()[:8]
table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
table[i] = 0
# For each bit position, deterministically assign exactly 128 positions to have that bit set
for bit_pos in range(64):
# Create a list of indices and shuffle deterministically
indices = list(range(256))
rng.shuffle(indices)
# Set the bit at bit_pos for the first 128 shuffled indices
for i in range(128):
j = indices[i]
table[j] |= (1ULL << bit_pos)
return table
@ -289,3 +306,14 @@ def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size
sum = _buzhash64_update(sum, remove, add, len, table)
free(table)
return sum
def buzhash64_get_table(bytes key):
    """
    Return the buzhash64 table derived from <key> as a list of 256 Python ints.

    The C table comes from buzhash64_init_table() and is always freed before returning.
    """
    cdef int idx
    cdef uint64_t *c_table = buzhash64_init_table(key)
    try:
        entries = []
        for idx in range(256):
            entries.append(c_table[idx])
        return entries
    finally:
        # release the C table even if building the list fails
        free(c_table)

View file

@ -6,63 +6,69 @@ from io import BytesIO
from ...chunkers import get_chunker
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
from ...constants import * # NOQA
from ...helpers import hex_to_bin
from .. import BaseTestCase
from . import cf
# from os.urandom(32)
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
key2 = hex_to_bin("57174a65fde67fe127b18430525b50a58406f1bd6cc629535208c7832e181067")
class ChunkerBuzHash64TestCase(BaseTestCase):
def test_chunkify64(self):
data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
parts = cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
self.assert_equal(len(parts), 2)
self.assert_equal(b"".join(parts), data)
self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
self.assert_equal(cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
self.assert_equal(
cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
cf(ChunkerBuzHash64(key1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
cf(ChunkerBuzHash64(key2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
cf(ChunkerBuzHash64(key0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
cf(ChunkerBuzHash64(key2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
cf(ChunkerBuzHash64(key0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
cf(ChunkerBuzHash64(key1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
cf(ChunkerBuzHash64(key2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
)
def test_buzhash64(self):
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
expected = buzhash64(b"abcdefghijklmnop", b"1")
previous = buzhash64(b"Xabcdefghijklmno", b"1")
this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 15080163834872228739)
self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 9505908538285923444)
expected = buzhash64(b"abcdefghijklmnop", key0)
previous = buzhash64(b"Xabcdefghijklmno", key0)
this = buzhash64_update(previous, ord("X"), ord("p"), 16, key0)
self.assert_equal(this, expected)
# Test with more than 63 bytes to make sure our barrel_shift macro works correctly
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 1936382207158378368)
def test_small_reads64(self):
class SmallReadFile:

View file

@ -4,10 +4,16 @@ import os
from . import cf
from ...chunkers import ChunkerBuzHash64
from ...chunkers.buzhash64 import buzhash64_get_table
from ...constants import * # NOQA
from ...helpers import hex_to_bin
# from os.urandom(32)
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
def H(data):
return sha256(data).digest()
@ -39,7 +45,8 @@ def test_chunkpoints64_unchanged():
# The "correct" hash below matches the existing chunker behavior.
# Future chunker optimisations must not change this, or existing repos will bloat.
overall_hash = H(b"".join(runs))
assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
print(overall_hash.hex())
assert overall_hash == hex_to_bin("db4b37fbe0cb841d79cfbb52bff8ac2f11040bf83a7d389640c7afb314fc4bfb")
def test_buzhash64_chunksize_distribution():
@ -67,3 +74,27 @@ def test_buzhash64_chunksize_distribution():
# most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
assert min_count < 10
assert max_count < 10
def test_buzhash64_table():
    """Check size, element type, determinism, key dependence and bit balance of the table."""
    first = buzhash64_get_table(key0)

    # The table is a list of 256 entries ...
    assert len(first) == 256
    # ... and every entry is a Python int.
    for entry in first:
        assert isinstance(entry, int)

    # Deterministic: asking again with the same key gives the same table.
    second = buzhash64_get_table(key0)
    assert first == second

    # Key dependent: a different key gives a different table.
    other = buzhash64_get_table(key1)
    assert first != other

    # Balanced: at each bit position 0..63, exactly half of the 256 entries have the bit set.
    for shift in range(64):
        ones = sum((entry >> shift) & 1 for entry in first)
        assert ones == 128  # 50% of 256 = 128