diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 463be44c8..5712c2ee8 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -13,7 +13,9 @@ def get_chunker(algo, *params, **kw): # key.chunk_seed only has 32bits seed = key.chunk_seed if key is not None else 0 # for buzhash64, we want a much longer key, so we derive it from the id key - bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"" + bh64_key = ( + key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32 + ) if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index e9e117427..a93f15a25 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -3,7 +3,6 @@ API_VERSION = '1.2_01' import cython -import random import time from cpython.bytes cimport PyBytes_AsString @@ -11,6 +10,8 @@ from libc.stdint cimport uint8_t, uint64_t from libc.stdlib cimport malloc, free from libc.string cimport memcpy, memmove +from ..crypto.low_level import CSPRNG + from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros from .reader import FileReader, Chunk @@ -45,7 +46,7 @@ cdef uint64_t* buzhash64_init_table(bytes key): Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1. """ # Create deterministic random number generator - rng = random.Random(int.from_bytes(key, 'big')) + rng = CSPRNG(key) cdef int i, j, bit_pos cdef uint64_t* table = malloc(2048) # 256 * sizeof(uint64_t) diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index 6930491f7..03b5a8bfa 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -25,50 +25,50 @@ class ChunkerBuzHash64TestCase(BaseTestCase): self.assert_equal(cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) self.assert_equal( cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + [b"foobarb", b"ooba", b"zf", b"oobarb", b"ooba", b"zf", b"oobarb", b"oobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"], + [b"fo", b"oba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"], + [b"foobar", b"booba", b"zfoobar", b"booba", b"zfoobar", b"boobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"], + [b"foobarbo", b"obaz", b"foobarbo", b"obaz", b"foobarbo", b"obaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"], + [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], + [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"], + [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"], + [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"], ) self.assert_equal( cf(ChunkerBuzHash64(key2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], + [b"foobarboobazfoob", b"arboobazfoob", b"arboobaz"], ) def test_buzhash64(self): - self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 15080163834872228739) - self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 9505908538285923444) + self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 17414563089559790077) + self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 1397285894609271345) expected = buzhash64(b"abcdefghijklmnop", key0) previous = buzhash64(b"Xabcdefghijklmno", key0) this = buzhash64_update(previous, ord("X"), ord("p"), 16, key0) self.assert_equal(this, expected) # Test with more than 63 bytes to make sure our barrel_shift macro works correctly - self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 1936382207158378368) + self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 17683050804041322250) def test_small_reads64(self): class SmallReadFile: diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index 363bc9e8f..1aafdbf70 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -36,7 +36,7 @@ def test_chunkpoints64_unchanged(): if minexp >= maxexp: continue for maskbits in (4, 7, 10, 12): - for key in (b"first_key", b"second_key"): + for key in (key0, key1): fh = BytesIO(data) chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize) chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] @@ -46,13 +46,13 @@ def test_chunkpoints64_unchanged(): # Future chunker optimisations must not change this, or existing repos will bloat. overall_hash = H(b"".join(runs)) print(overall_hash.hex()) - assert overall_hash == hex_to_bin("db4b37fbe0cb841d79cfbb52bff8ac2f11040bf83a7d389640c7afb314fc4bfb") + assert overall_hash == hex_to_bin("676676133fb3621ada0f6cc1b18002c3e37016c9469217d18f8e382fadaf23fd") def test_buzhash64_chunksize_distribution(): data = os.urandom(1048576) min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB - chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095) + chunker = ChunkerBuzHash64(key0, min_exp, max_exp, mask, 4095) f = BytesIO(data) chunks = cf(chunker.chunkify(f)) del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp