buzhash64: use own CSPRNG

This commit is contained in:
Thomas Waldmann 2025-06-15 15:53:02 +02:00
parent bb7a4647ea
commit 3617b63336
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
4 changed files with 21 additions and 18 deletions

View file

@ -13,7 +13,9 @@ def get_chunker(algo, *params, **kw):
# key.chunk_seed only has 32 bits
seed = key.chunk_seed if key is not None else 0
# for buzhash64, we want a much longer key, so we derive it from the id key
bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
bh64_key = (
key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
)
if algo == "buzhash":
return Chunker(seed, *params, sparse=sparse)
if algo == "buzhash64":

View file

@ -3,7 +3,6 @@
API_VERSION = '1.2_01'
import cython
import random
import time
from cpython.bytes cimport PyBytes_AsString
@ -11,6 +10,8 @@ from libc.stdint cimport uint8_t, uint64_t
from libc.stdlib cimport malloc, free
from libc.string cimport memcpy, memmove
from ..crypto.low_level import CSPRNG
from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
from .reader import FileReader, Chunk
@ -45,7 +46,7 @@ cdef uint64_t* buzhash64_init_table(bytes key):
Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1.
"""
# Create deterministic random number generator
rng = random.Random(int.from_bytes(key, 'big'))
rng = CSPRNG(key)
cdef int i, j, bit_pos
cdef uint64_t* table = <uint64_t*>malloc(2048) # 256 * sizeof(uint64_t)

View file

@ -25,50 +25,50 @@ class ChunkerBuzHash64TestCase(BaseTestCase):
self.assert_equal(cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
self.assert_equal(
cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
[b"foobarb", b"ooba", b"zf", b"oobarb", b"ooba", b"zf", b"oobarb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"],
[b"fo", b"oba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"],
[b"foobar", b"booba", b"zfoobar", b"booba", b"zfoobar", b"boobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"],
[b"foobarbo", b"obaz", b"foobarbo", b"obaz", b"foobarbo", b"obaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
[b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
[b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"],
[b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"],
[b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
)
self.assert_equal(
cf(ChunkerBuzHash64(key2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
[b"foobarboobazfoob", b"arboobazfoob", b"arboobaz"],
)
def test_buzhash64(self):
self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 15080163834872228739)
self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 9505908538285923444)
self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 17414563089559790077)
self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 1397285894609271345)
expected = buzhash64(b"abcdefghijklmnop", key0)
previous = buzhash64(b"Xabcdefghijklmno", key0)
this = buzhash64_update(previous, ord("X"), ord("p"), 16, key0)
self.assert_equal(this, expected)
# Test with more than 63 bytes to make sure our barrel_shift macro works correctly
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 1936382207158378368)
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 17683050804041322250)
def test_small_reads64(self):
class SmallReadFile:

View file

@ -36,7 +36,7 @@ def test_chunkpoints64_unchanged():
if minexp >= maxexp:
continue
for maskbits in (4, 7, 10, 12):
for key in (b"first_key", b"second_key"):
for key in (key0, key1):
fh = BytesIO(data)
chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
@ -46,13 +46,13 @@ def test_chunkpoints64_unchanged():
# Future chunker optimisations must not change this, or existing repos will bloat.
overall_hash = H(b"".join(runs))
print(overall_hash.hex())
assert overall_hash == hex_to_bin("db4b37fbe0cb841d79cfbb52bff8ac2f11040bf83a7d389640c7afb314fc4bfb")
assert overall_hash == hex_to_bin("676676133fb3621ada0f6cc1b18002c3e37016c9469217d18f8e382fadaf23fd")
def test_buzhash64_chunksize_distribution():
data = os.urandom(1048576)
min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB
chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
chunker = ChunkerBuzHash64(key0, min_exp, max_exp, mask, 4095)
f = BytesIO(data)
chunks = cf(chunker.chunkify(f))
del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp