mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-11 01:41:57 -04:00
buzhash64: deterministically create a balanced bh table
The previous approach had cryptographic-strength randomness, but a precise 50:50 distribution of 0 and 1 bits per bit position in the table was not assured. Now this is always the case because of the way the table is constructed.
This commit is contained in:
parent
d04f41b886
commit
d48c9643e8
4 changed files with 101 additions and 35 deletions
|
|
@ -6,6 +6,7 @@ API_VERSION: str
|
|||
|
||||
def buzhash64(data: bytes, key: bytes) -> int: ...
|
||||
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
|
||||
def buzhash64_get_table(key: bytes) -> List[int]: ...
|
||||
|
||||
class ChunkerBuzHash64:
|
||||
def __init__(
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
API_VERSION = '1.2_01'
|
||||
|
||||
import cython
|
||||
import random
|
||||
import time
|
||||
from hashlib import sha256
|
||||
|
||||
from cpython.bytes cimport PyBytes_AsString
|
||||
from libc.stdint cimport uint8_t, uint64_t
|
||||
|
|
@ -40,14 +40,31 @@ cdef extern from *:
|
|||
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key):
    """
    Generate a balanced pseudo-random table deterministically from a 256-bit key.

    Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1.

    Returns a malloc'ed array of 256 uint64_t values. Ownership passes to the caller, who must free() it.
    Raises MemoryError if the table cannot be allocated.
    """
    cdef int i, j, bit_pos
    cdef uint64_t* table

    # Create deterministic random number generator: the same key always yields the same
    # sequence of shuffles and therefore the same table.
    # This is done before allocating, so a failure here can not leak the table.
    rng = random.Random(int.from_bytes(key, 'big'))

    table = <uint64_t*>malloc(256 * sizeof(uint64_t))
    if table == NULL:
        raise MemoryError("could not allocate buzhash64 table")

    try:
        # Initialize all values to 0, the bits are OR-ed in below.
        for i in range(256):
            table[i] = 0

        # For each bit position, deterministically assign exactly 128 positions to have that bit set
        for bit_pos in range(64):
            # Create a list of indices and shuffle deterministically
            indices = list(range(256))
            rng.shuffle(indices)

            # Set the bit at bit_pos for the first 128 shuffled indices
            for i in range(128):
                j = indices[i]
                table[j] |= (1ULL << bit_pos)
    except BaseException:
        # Do not leak the half-built table if anything above raised.
        free(table)
        raise

    return table
|
||||
|
||||
|
||||
|
|
@ -289,3 +306,14 @@ def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size
|
|||
sum = _buzhash64_update(sum, remove, add, len, table)
|
||||
free(table)
|
||||
return sum
|
||||
|
||||
|
||||
def buzhash64_get_table(bytes key):
    """Return the buzhash table generated from <key> as a Python list of 256 ints."""
    cdef int idx
    cdef uint64_t *entries = buzhash64_init_table(key)
    try:
        values = []
        for idx in range(256):
            values.append(entries[idx])
        return values
    finally:
        # The C table was built for this call only, release it in any case.
        free(entries)
|
||||
|
|
|
|||
|
|
@ -6,63 +6,69 @@ from io import BytesIO
|
|||
from ...chunkers import get_chunker
|
||||
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
|
||||
from ...constants import * # NOQA
|
||||
from ...helpers import hex_to_bin
|
||||
from .. import BaseTestCase
|
||||
from . import cf
|
||||
|
||||
# from os.urandom(32)
|
||||
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
|
||||
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
|
||||
key2 = hex_to_bin("57174a65fde67fe127b18430525b50a58406f1bd6cc629535208c7832e181067")
|
||||
|
||||
|
||||
class ChunkerBuzHash64TestCase(BaseTestCase):
|
||||
def test_chunkify64(self):
|
||||
data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
|
||||
parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
|
||||
parts = cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
|
||||
self.assert_equal(len(parts), 2)
|
||||
self.assert_equal(b"".join(parts), data)
|
||||
self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
|
||||
self.assert_equal(cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
|
||||
cf(ChunkerBuzHash64(key0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
|
||||
cf(ChunkerBuzHash64(key1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobar", b"boob", b"az", b"foobar", b"boob", b"az", b"foobar", b"boobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
|
||||
cf(ChunkerBuzHash64(key2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarb", b"oob", b"az", b"foobarb", b"oob", b"az", b"foobarb", b"oobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
cf(ChunkerBuzHash64(key0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarb", b"oobazf", b"oobarb", b"oobazf", b"oobarb", b"oobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(key1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
|
||||
cf(ChunkerBuzHash64(key2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
|
||||
cf(ChunkerBuzHash64(key0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobazf", b"oobarboobazf", b"oobarboobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
|
||||
cf(ChunkerBuzHash64(key1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfoobarb", b"oobazfoobarb", b"oobaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
|
||||
)
|
||||
self.assert_equal(
|
||||
cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
|
||||
cf(ChunkerBuzHash64(key2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
|
||||
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"],
|
||||
)
|
||||
|
||||
def test_buzhash64(self):
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
|
||||
expected = buzhash64(b"abcdefghijklmnop", b"1")
|
||||
previous = buzhash64(b"Xabcdefghijklmno", b"1")
|
||||
this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", key0), 15080163834872228739)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnop", key1), 9505908538285923444)
|
||||
expected = buzhash64(b"abcdefghijklmnop", key0)
|
||||
previous = buzhash64(b"Xabcdefghijklmno", key0)
|
||||
this = buzhash64_update(previous, ord("X"), ord("p"), 16, key0)
|
||||
self.assert_equal(this, expected)
|
||||
# Test with more than 63 bytes to make sure our barrel_shift macro works correctly
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
|
||||
self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, key0), 1936382207158378368)
|
||||
|
||||
def test_small_reads64(self):
|
||||
class SmallReadFile:
|
||||
|
|
|
|||
|
|
@ -4,10 +4,16 @@ import os
|
|||
|
||||
from . import cf
|
||||
from ...chunkers import ChunkerBuzHash64
|
||||
from ...chunkers.buzhash64 import buzhash64_get_table
|
||||
from ...constants import * # NOQA
|
||||
from ...helpers import hex_to_bin
|
||||
|
||||
|
||||
# from os.urandom(32)
|
||||
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
|
||||
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
|
||||
|
||||
|
||||
def H(data):
    """Return the raw (binary) SHA-256 digest of data."""
    hasher = sha256(data)
    return hasher.digest()
|
||||
|
||||
|
|
@ -39,7 +45,8 @@ def test_chunkpoints64_unchanged():
|
|||
# The "correct" hash below matches the existing chunker behavior.
|
||||
# Future chunker optimisations must not change this, or existing repos will bloat.
|
||||
overall_hash = H(b"".join(runs))
|
||||
assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
|
||||
print(overall_hash.hex())
|
||||
assert overall_hash == hex_to_bin("db4b37fbe0cb841d79cfbb52bff8ac2f11040bf83a7d389640c7afb314fc4bfb")
|
||||
|
||||
|
||||
def test_buzhash64_chunksize_distribution():
|
||||
|
|
@ -67,3 +74,27 @@ def test_buzhash64_chunksize_distribution():
|
|||
# most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
|
||||
assert min_count < 10
|
||||
assert max_count < 10
|
||||
|
||||
|
||||
def test_buzhash64_table():
    """Check shape, determinism, key dependence and bit balance of the buzhash64 table."""
    # The table must be a list of 256 entries ...
    reference = buzhash64_get_table(key0)
    assert len(reference) == 256

    # ... and every entry must be a Python int.
    assert all(isinstance(entry, int) for entry in reference)

    # Same key -> same table (deterministic construction).
    repeated = buzhash64_get_table(key0)
    assert reference == repeated

    # Different key -> different table.
    other = buzhash64_get_table(key1)
    assert reference != other

    # Balanced bit distribution: for each bit position 0..63, exactly half of the
    # 256 table values (i.e. 128) must have that bit set to 1.
    for bit_pos in range(64):
        ones = sum((entry >> bit_pos) & 1 for entry in reference)
        assert ones == 128  # 50% of 256 = 128
|
||||
|
|
|
|||
Loading…
Reference in a new issue