mirror of
https://github.com/borgbackup/borg.git
synced 2026-05-28 04:03:21 -04:00
buzhash64: init table using a 256-bit key derived from the ID key
That way we can feed lots of entropy into the table creation. The bh64_key is derived from the id_key (NOT the crypt_key), so related repositories get the same bh64_key even if they use different encryption/authentication keys. As a result, they also build the same buzhash64 table and cut chunks at the same points, so deduplication works among the related repositories.
This commit is contained in:
parent
544b3f41a9
commit
b9646f236e
5 changed files with 46 additions and 50 deletions
|
|
@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64
|
|||
from .failing import ChunkerFailing
|
||||
from .fixed import ChunkerFixed
|
||||
from .reader import * # noqa
|
||||
from ..crypto.key import PlaintextKey
|
||||
|
||||
API_VERSION = "1.2_01"
|
||||
|
||||
|
|
@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw):
|
|||
sparse = kw.get("sparse", False)
|
||||
# key.chunk_seed only has 32bits
|
||||
seed = key.chunk_seed if key is not None else 0
|
||||
# we want 64bits for buzhash64, get them from crypt_key
|
||||
if key is None or isinstance(key, PlaintextKey):
|
||||
seed64 = 0
|
||||
else:
|
||||
seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little")
|
||||
# for buzhash64, we want a much longer key, so we derive it from the id key
|
||||
bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
|
||||
if algo == "buzhash":
|
||||
return Chunker(seed, *params, sparse=sparse)
|
||||
if algo == "buzhash64":
|
||||
return ChunkerBuzHash64(seed64, *params, sparse=sparse)
|
||||
return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
|
||||
if algo == "fixed":
|
||||
return ChunkerFixed(*params, sparse=sparse)
|
||||
if algo == "fail":
|
||||
|
|
|
|||
|
|
@ -4,13 +4,13 @@ from .reader import fmap_entry
|
|||
|
||||
API_VERSION: str
|
||||
|
||||
def buzhash64(data: bytes, seed: int) -> int: ...
|
||||
def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ...
|
||||
def buzhash64(data: bytes, key: bytes) -> int: ...
|
||||
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
|
||||
|
||||
class ChunkerBuzHash64:
|
||||
def __init__(
|
||||
self,
|
||||
seed: int,
|
||||
key: bytes,
|
||||
chunk_min_exp: int,
|
||||
chunk_max_exp: int,
|
||||
hash_mask_bits: int,
|
||||
|
|
|
|||
|
|
@ -39,13 +39,13 @@ cdef extern from *:
|
|||
|
||||
@cython.boundscheck(False) # Deactivate bounds checking
|
||||
@cython.wraparound(False) # Deactivate negative indexing.
|
||||
cdef uint64_t* buzhash64_init_table(uint64_t seed):
|
||||
"""Initialize the buzhash table with the given seed."""
|
||||
cdef uint64_t* buzhash64_init_table(bytes key):
|
||||
"""Initialize the buzhash table using the given key."""
|
||||
cdef int i
|
||||
cdef uint64_t* table = <uint64_t*>malloc(2048) # 256 * sizeof(uint64_t)
|
||||
for i in range(256):
|
||||
# deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed:
|
||||
v = f"{i:02x}{seed:016x}".encode()
|
||||
# deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
|
||||
v = f"{i:02x}".encode() + key
|
||||
d64 = sha256(v).digest()[:8]
|
||||
table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
|
||||
return table
|
||||
|
|
@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64:
|
|||
cdef size_t reader_block_size
|
||||
cdef bint sparse
|
||||
|
||||
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
|
||||
def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
|
||||
min_size = 1 << chunk_min_exp
|
||||
max_size = 1 << chunk_max_exp
|
||||
assert max_size <= len(zeros)
|
||||
|
|
@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64:
|
|||
self.window_size = hash_window_size
|
||||
self.chunk_mask = (1 << hash_mask_bits) - 1
|
||||
self.min_size = min_size
|
||||
self.table = buzhash64_init_table(seed & 0xffffffffffffffff)
|
||||
self.table = buzhash64_init_table(key)
|
||||
self.buf_size = max_size
|
||||
self.data = <uint8_t*>malloc(self.buf_size)
|
||||
self.fh = -1
|
||||
|
|
@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64:
|
|||
return Chunk(data, size=got, allocation=allocation)
|
||||
|
||||
|
||||
def buzhash64(data, unsigned long seed):
|
||||
def buzhash64(data, bytes key):
|
||||
cdef uint64_t *table
|
||||
cdef uint64_t sum
|
||||
table = buzhash64_init_table(seed & 0xffffffffffffffff)
|
||||
table = buzhash64_init_table(key)
|
||||
sum = _buzhash64(<const unsigned char *> data, len(data), table)
|
||||
free(table)
|
||||
return sum
|
||||
|
||||
|
||||
def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
|
||||
def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
|
||||
cdef uint64_t *table
|
||||
table = buzhash64_init_table(seed & 0xffffffffffffffff)
|
||||
table = buzhash64_init_table(key)
|
||||
sum = _buzhash64_update(sum, remove, add, len, table)
|
||||
free(table)
|
||||
return sum
|
||||
|
|
|
|||
|
|
@ -13,56 +13,56 @@ from . import cf
|
|||
class ChunkerBuzHash64TestCase(BaseTestCase):
|
||||
def test_chunkify64(self):
|
||||
data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
|
||||
parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
|
||||
parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
|
||||
self.assert_equal(len(parts), 2)
|
||||
self.assert_equal(b"".join(parts), data)
|
||||
self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
|
||||
self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"],
|
||||
cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobazfoobarboobazfoobarboobaz"],
|
||||
cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"],
|
||||
cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"],
|
||||
cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"],
|
||||
cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
|
||||
cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
|
||||
)
|
||||
|
||||
def test_buzhash64(self):
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719)
|
||||
expected = buzhash64(b"abcdefghijklmnop", 1)
|
||||
previous = buzhash64(b"Xabcdefghijklmno", 1)
|
||||
this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
|
||||
expected = buzhash64(b"abcdefghijklmnop", b"1")
|
||||
previous = buzhash64(b"Xabcdefghijklmno", b"1")
|
||||
this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
|
||||
self.assert_equal(this, expected)
|
||||
# Test with more than 63 bytes to make sure our barrel_shift macro works correctly
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
|
||||
|
||||
def test_small_reads64(self):
|
||||
class SmallReadFile:
|
||||
|
|
|
|||
|
|
@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged():
|
|||
if minexp >= maxexp:
|
||||
continue
|
||||
for maskbits in (4, 7, 10, 12):
|
||||
for seed in (1849058162, 1234567653):
|
||||
for key in (b"first_key", b"second_key"):
|
||||
fh = BytesIO(data)
|
||||
chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize)
|
||||
chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
|
||||
chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
|
||||
runs.append(H(b"".join(chunks)))
|
||||
|
||||
# The "correct" hash below matches the existing chunker behavior.
|
||||
# Future chunker optimisations must not change this, or existing repos will bloat.
|
||||
overall_hash = H(b"".join(runs))
|
||||
assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7")
|
||||
assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
|
||||
|
||||
|
||||
def test_buzhash64_chunksize_distribution():
|
||||
data = os.urandom(1048576)
|
||||
min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB
|
||||
chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095)
|
||||
chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
|
||||
f = BytesIO(data)
|
||||
chunks = cf(chunker.chunkify(f))
|
||||
del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp
|
||||
|
|
|
|||
Loading…
Reference in a new issue