Merge pull request #8903 from ThomasWaldmann/buzhash64

buzhash64 chunker
This commit is contained in:
TW 2025-06-11 08:31:27 +02:00 committed by GitHub
commit 9a65d5245d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 519 additions and 16 deletions

1
.gitignore vendored
View file

@ -7,6 +7,7 @@ src/borg/compress.c
src/borg/crypto/low_level.c
src/borg/item.c
src/borg/chunkers/buzhash.c
src/borg/chunkers/buzhash64.c
src/borg/chunkers/reader.c
src/borg/checksums.c
src/borg/platform/darwin.c

View file

@ -19,8 +19,8 @@ specified when the backup was performed.
Deduplication is performed globally across all data in the repository
(multiple backups and even multiple hosts), both on data and file
metadata, using :ref:`chunks` created by the chunker using the
Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize
algorithm ("fixed" chunker).
Buzhash_ algorithm ("buzhash" and "buzhash64" chunkers) or a simpler
fixed blocksize algorithm ("fixed" chunker).
To perform the repository-wide deduplication, a hash of each
chunk is checked against the :ref:`chunks cache <cache>`, which is a

View file

@ -399,6 +399,7 @@ Borg has these chunkers:
supporting a header block of different size.
- "buzhash": variable, content-defined blocksize, uses a rolling hash
computed by the Buzhash_ algorithm.
- "buzhash64": similar to "buzhash", but an improved 64-bit implementation.
For some more general usage hints see also ``--chunker-params``.
@ -469,6 +470,16 @@ for the repository, and stored encrypted in the keyfile. This is to prevent
chunk size based fingerprinting attacks on your encrypted repo contents (to
guess what files you have based on a specific set of chunk sizes).
"buzhash64" chunker
+++++++++++++++++++
Similar to "buzhash", but using 64bit wide hash values.
The buzhash table is cryptographically derived from secret key material.
These changes should improve resistance against attacks and also solve
some of the issues of the original (32bit / XORed table) implementation.
.. _cache:
The cache

View file

@ -361,13 +361,19 @@ The chunks stored in the repo are the (compressed, encrypted and authenticated)
output of the chunker. The sizes of these stored chunks are influenced by the
compression, encryption and authentication.
buzhash chunker
~~~~~~~~~~~~~~~
buzhash and buzhash64 chunkers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The buzhash chunker chunks according to the input data, the chunker's
parameters and the secret chunker seed (which all influence the chunk boundary
The buzhash chunkers chunk according to the input data, the chunker's
parameters and secret key material (which all influence the chunk boundary
positions).
Secret key material:
- "buzhash": chunker seed (32bits), used for XORing the hardcoded buzhash table
- "buzhash64": bh64_key (256bits) is derived from ID key, used to cryptographically
generate the table.
Small files below some specific threshold (default: 512 KiB) result in only one
chunk (identical content / size as the original file), bigger files result in
multiple chunks.

View file

@ -543,6 +543,7 @@ cython_sources = """
src/borg/compress.pyx
src/borg/crypto/low_level.pyx
src/borg/chunkers/buzhash.pyx
src/borg/chunkers/buzhash64.pyx
src/borg/chunkers/reader.pyx
src/borg/hashindex.pyx
src/borg/item.pyx

View file

@ -51,6 +51,7 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro
compress_source = "src/borg/compress.pyx"
crypto_ll_source = "src/borg/crypto/low_level.pyx"
buzhash_source = "src/borg/chunkers/buzhash.pyx"
buzhash64_source = "src/borg/chunkers/buzhash64.pyx"
reader_source = "src/borg/chunkers/reader.pyx"
hashindex_source = "src/borg/hashindex.pyx"
item_source = "src/borg/item.pyx"
@ -66,6 +67,7 @@ cython_sources = [
compress_source,
crypto_ll_source,
buzhash_source,
buzhash64_source,
reader_source,
hashindex_source,
item_source,
@ -185,6 +187,7 @@ if not on_rtd:
Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
Extension("borg.item", [item_source], extra_compile_args=cflags),
Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
Extension("borg.checksums", **checksums_ext_kwargs),
]

View file

@ -351,7 +351,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer()
self.chunks = []
self.key = key
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, sparse=False)
self.chunker = get_chunker(*chunker_params, key=self.key, sparse=False)
self.saved_chunks_len = None
def add(self, item):
@ -1227,7 +1227,7 @@ class FilesystemObjectProcessors:
self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None
self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
self.chunker = get_chunker(*chunker_params, key=key, sparse=sparse)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None):
@ -1502,7 +1502,7 @@ class TarfileObjectProcessors:
self.print_file_status = file_status_printer or (lambda *args: None)
self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False)
self.chunker = get_chunker(*chunker_params, key=key, sparse=False)
self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks
@contextmanager
@ -2325,7 +2325,7 @@ class ArchiveRecreater:
target.process_file_chunks = ChunksProcessor(
cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify
).process_file_chunks
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False)
target.chunker = get_chunker(*target.chunker_params, key=self.key, sparse=False)
return target
def create_target_archive(self, name):

View file

@ -146,7 +146,8 @@ class BenchmarkMixIn:
pass
for spec, func in [
("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)),
("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)),
("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)),
("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)),
]:
print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s")

View file

@ -41,7 +41,7 @@ def transfer_chunks(
file = ChunkIteratorFileWrapper(chunk_iterator)
# Create a chunker with the specified parameters
chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False)
chunker = get_chunker(*chunker_params, key=archive.key, sparse=False)
for chunk in chunker.chunkify(file):
if not dry_run:
chunk_id, data = cached_hash(chunk, archive.key.id_hash)

View file

@ -1,4 +1,5 @@
from .buzhash import Chunker
from .buzhash64 import ChunkerBuzHash64
from .failing import ChunkerFailing
from .fixed import ChunkerFixed
from .reader import * # noqa
@ -7,12 +8,17 @@ API_VERSION = "1.2_01"
def get_chunker(algo, *params, **kw):
key = kw.get("key", None)
sparse = kw.get("sparse", False)
# key.chunk_seed only has 32bits
seed = key.chunk_seed if key is not None else 0
# for buzhash64, we want a much longer key, so we derive it from the id key
bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
if algo == "buzhash":
seed = kw["seed"]
sparse = kw["sparse"]
return Chunker(seed, *params, sparse=sparse)
if algo == "buzhash64":
return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
if algo == "fixed":
sparse = kw["sparse"]
return ChunkerFixed(*params, sparse=sparse)
if algo == "fail":
return ChunkerFailing(*params)

View file

@ -0,0 +1,20 @@
from typing import BinaryIO, Iterator, List, Optional
from .reader import fmap_entry
API_VERSION: str
# Compute the 64-bit buzhash of `data`, using a hash table generated from `key`.
def buzhash64(data: bytes, key: bytes) -> int: ...
# Roll an existing hash `sum` forward by one byte: drop `remove`, append `add`.
# `len` is the window length; the hash table is generated from `key`.
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
class ChunkerBuzHash64:
    # Content-defined chunker based on a 64-bit buzhash rolling hash.
    # `key` is the key material the hash table is generated from; the *_exp
    # parameters are powers of two for the minimum / maximum chunk size.
    def __init__(
        self,
        key: bytes,
        chunk_min_exp: int,
        chunk_max_exp: int,
        hash_mask_bits: int,
        hash_window_size: int,
        sparse: bool = False,
    ) -> None: ...
    # Accumulated wall-clock time spent producing chunks (read-only).
    @property
    def chunking_time(self) -> float: ...
    # `fd` / `fmap` may be None, so they are declared Optional explicitly
    # (PEP 484 does not allow implicit Optional for a None default).
    def chunkify(
        self, fd: Optional[BinaryIO] = None, fh: int = -1, fmap: Optional[List[fmap_entry]] = None
    ) -> Iterator: ...

View file

@ -0,0 +1,291 @@
# cython: language_level=3
API_VERSION = '1.2_01'
import cython
import time
from hashlib import sha256
from cpython.bytes cimport PyBytes_AsString
from libc.stdint cimport uint8_t, uint64_t
from libc.stdlib cimport malloc, free
from libc.string cimport memcpy, memmove
from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
from .reader import FileReader, Chunk
# Cyclic polynomial / buzhash
#
# https://en.wikipedia.org/wiki/Rolling_hash
#
# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
#
# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
#
# Some properties of buzhash / of this implementation:
#
# (1) the hash is designed for inputs <= 64 bytes, but the chunker uses it on a 4095 byte window;
# any repeating bytes at distance 64 within those 4095 bytes can cause cancellation within
# the hash function, e.g. in "X <any 63 bytes> X", the last X would cancel out the influence
# of the first X on the hash value.
# This seems to be the most reliable way to inline this code, using a C preprocessor macro.
# BARREL_SHIFT64 rotates the 64-bit value v left by shift bits. The right-shift count is
# masked with 0x3f, so that shift == 0 results in a right shift by 0 instead of by 64.
cdef extern from *:
    """
    #define BARREL_SHIFT64(v, shift) (((v) << (shift)) | ((v) >> (((64 - (shift)) & 0x3f))))
    """
    uint64_t BARREL_SHIFT64(uint64_t v, uint64_t shift)
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key) except NULL:
    """
    Allocate and initialize the buzhash table using the given key.

    Table entry i is built from the first 8 bytes (little-endian) of
    sha256(<i as 2 lowercase hex digits> + key).

    :param key: key material mixed into every table entry
    :return: malloc'ed table of 256 uint64_t entries, the caller must free() it
    :raises MemoryError: if the table can not be allocated
    """
    cdef int i
    cdef uint64_t* table = <uint64_t*>malloc(256 * sizeof(uint64_t))
    if table == NULL:
        raise MemoryError("can not allocate buzhash64 table")
    try:
        for i in range(256):
            # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
            v = f"{i:02x}".encode() + key
            d64 = sha256(v).digest()[:8]
            table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
    except:
        # do not leak the table if generating an entry fails (e.g. key is None)
        free(table)
        raise
    return table
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
cdef uint64_t _buzhash64(const unsigned char* data, size_t len, const uint64_t* h):
    """
    Calculate the buzhash of the given data.

    :param data: pointer to the first byte of the window
    :param len: window length in bytes
    :param h: buzhash table (256 entries)
    :return: the 64-bit hash value, 0 for an empty window
    """
    cdef uint64_t i
    cdef uint64_t sum = 0, imod
    if len == 0:
        # len is unsigned: "len - 1" would wrap around and the loop below would read
        # far beyond the buffer. XORing nothing together gives 0.
        return 0
    for i in range(len - 1, 0, -1):
        # rotation counts are taken modulo 64 (the width of the hash)
        imod = i & 0x3f
        sum ^= BARREL_SHIFT64(h[data[0]], imod)
        data += 1
    # the last byte of the window enters unrotated
    return sum ^ h[data[0]]
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
cdef uint64_t _buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, const uint64_t* h):
    """
    Roll the hash forward by one byte.

    The previous hash is rotated by one bit, the contribution of the byte leaving
    the window (rotated by the window length modulo 64) is XORed out and the table
    value of the byte entering the window is XORed in.
    """
    cdef uint64_t window_rot = len & 0x3f
    cdef uint64_t rotated = BARREL_SHIFT64(sum, 1)
    cdef uint64_t leaving = BARREL_SHIFT64(h[remove], window_rot)
    return rotated ^ leaving ^ h[add]
cdef class ChunkerBuzHash64:
    """
    Content-Defined Chunker, variable chunk sizes.

    This chunker makes quite some effort to cut mostly chunks of the same-content, even if
    the content moves to a different offset inside the file. It uses the buzhash
    rolling-hash algorithm to identify the chunk cutting places by looking at the
    content inside the moving window and computing the rolling hash value over the
    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
    The buzhash table is generated from the given key (not from a 32bit seed); using
    secret key material is meant to avoid some chunk length fingerprinting attacks.
    """
    cdef uint64_t chunk_mask
    cdef uint64_t* table
    cdef uint8_t* data
    cdef object _fd  # Python object for file descriptor
    cdef int fh
    cdef int done, eof
    cdef size_t min_size, buf_size, window_size, remaining, position, last
    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
    # double, not float: many tiny time deltas get accumulated here and a single precision
    # float would silently drop them once the total has grown large enough.
    cdef readonly double chunking_time
    cdef object file_reader  # FileReader instance
    cdef size_t reader_block_size
    cdef bint sparse

    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
        """
        Set up the chunker.

        :param key: key material used to generate the buzhash table
        :param chunk_min_exp: minimum chunk size is 2^chunk_min_exp
        :param chunk_max_exp: maximum chunk size (and buffer size) is 2^chunk_max_exp
        :param hash_mask_bits: a chunk is cut when that many low bits of the hash are 0
        :param hash_window_size: size of the rolling hash window in bytes
        :param sparse: passed on to the FileReader created by chunkify()
        :raises MemoryError: if the table or the data buffer can not be allocated
        """
        min_size = 1 << chunk_min_exp
        max_size = 1 << chunk_max_exp
        assert max_size <= len(zeros)
        # see chunker_process, first while loop condition, first term must be able to get True:
        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"

        self.window_size = hash_window_size
        # do the shift in 64 bits, so the mask is not limited by the width of a C int / long
        self.chunk_mask = ((<uint64_t>1) << hash_mask_bits) - 1
        self.min_size = min_size
        self.table = buzhash64_init_table(key)
        self.buf_size = max_size
        self.data = <uint8_t*>malloc(self.buf_size)
        if self.data == NULL:
            # __dealloc__ takes care of freeing the already allocated table
            raise MemoryError("can not allocate chunker buffer")
        self.fh = -1
        self.done = 0
        self.eof = 0
        self.remaining = 0
        self.position = 0
        self.last = 0
        self.bytes_read = 0
        self.bytes_yielded = 0
        self._fd = None
        self.chunking_time = 0.0
        self.reader_block_size = 1024 * 1024
        self.sparse = sparse

    def __dealloc__(self):
        """Free the chunker's resources."""
        if self.table != NULL:
            free(self.table)
            self.table = NULL
        if self.data != NULL:
            free(self.data)
            self.data = NULL

    cdef int fill(self) except 0:
        """
        Fill the chunker's buffer with more data.

        Always returns 1; 0 is reserved for signalling a Python exception.
        Sets self.eof when the reader delivers no more data.
        """
        cdef ssize_t n
        cdef object chunk

        # Move remaining data to the beginning of the buffer
        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
        self.position -= self.last
        self.last = 0
        n = self.buf_size - self.position - self.remaining

        if self.eof or n == 0:
            # nothing more to read or no free space in the buffer
            return 1

        # Use FileReader to read data
        chunk = self.file_reader.read(n)
        n = chunk.meta["size"]

        if n > 0:
            # Only copy data if it's not a hole
            if chunk.meta["allocation"] == CH_DATA:
                # Copy data from chunk to our buffer
                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
            else:
                # For holes, fill with zeros
                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)

            self.remaining += n
            self.bytes_read += n
        else:
            self.eof = 1

        return 1

    cdef object process(self) except *:
        """
        Process the chunker's buffer and return the next chunk.

        :return: a memoryview of the next chunk's bytes
        :raises StopIteration: when all data that was read has been yielded
        :raises Exception: if the count of bytes read and bytes yielded differs at the end
        """
        cdef uint64_t sum, chunk_mask = self.chunk_mask
        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
        cdef uint8_t* p
        cdef uint8_t* stop_at
        cdef size_t did_bytes

        if self.done:
            if self.bytes_read == self.bytes_yielded:
                raise StopIteration
            else:
                raise Exception("chunkifier byte count mismatch")

        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in Chunker init
            # note: fill() returns 1 or raises, the "return None" is purely defensive
            if not self.fill():
                return None

        # Here we either are at eof...
        if self.eof:
            self.done = 1
            if self.remaining:
                self.bytes_yielded += self.remaining
                # Return a memory view of the remaining data
                return memoryview((self.data + self.position)[:self.remaining])
            else:
                if self.bytes_read == self.bytes_yielded:
                    raise StopIteration
                else:
                    raise Exception("chunkifier byte count mismatch")

        # ... or we have at least min_size + window_size + 1 bytes remaining.
        # We do not want to "cut" a chunk smaller than min_size and the hash
        # window starts at the potential cutting place.
        self.position += min_size
        self.remaining -= min_size
        sum = _buzhash64(self.data + self.position, window_size, self.table)

        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
            p = self.data + self.position
            stop_at = p + self.remaining - window_size

            # roll the window over the buffered data until the hash triggers a cut
            # or the window would reach beyond the buffered data
            while p < stop_at and (sum & chunk_mask):
                sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
                p += 1

            did_bytes = p - (self.data + self.position)
            self.position += did_bytes
            self.remaining -= did_bytes

            if self.remaining <= window_size:
                if not self.fill():
                    return None

        if self.remaining <= window_size:
            # not enough data left for another full window: it all goes into this chunk
            self.position += self.remaining
            self.remaining = 0

        old_last = self.last
        self.last = self.position
        n = self.last - old_last
        self.bytes_yielded += n

        # Return a memory view of the chunk
        return memoryview((self.data + old_last)[:n])

    def chunkify(self, fd, fh=-1, fmap=None):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        :return: self, iterate over it to get the chunks
        """
        self._fd = fd
        self.fh = fh
        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
        # reset all per-file state, so the chunker can be reused for the next file
        self.done = 0
        self.remaining = 0
        self.bytes_read = 0
        self.bytes_yielded = 0
        self.position = 0
        self.last = 0
        self.eof = 0
        return self

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next Chunk, all-zero content is yielded as CH_ALLOC without data."""
        started_chunking = time.monotonic()
        data = self.process()
        got = len(data)
        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
        # but we can just check if data was all-zero (and either came from a hole
        # or from stored zeros - we can not detect that here).
        if zeros.startswith(data):
            data = None
            allocation = CH_ALLOC
        else:
            allocation = CH_DATA
        self.chunking_time += time.monotonic() - started_chunking
        return Chunk(data, size=got, allocation=allocation)
def buzhash64(data, bytes key):
    """
    Compute the buzhash64 of data, using a table generated from key.

    :param data: the bytes to hash
    :param key: key material for generating the buzhash table
    :return: the 64-bit hash value as int, 0 for empty data
    """
    cdef uint64_t *table
    cdef uint64_t sum
    if len(data) == 0:
        # the hashing loop must not be entered with a zero length, it would read out of bounds
        return 0
    table = buzhash64_init_table(key)
    try:
        sum = _buzhash64(<const unsigned char *> data, len(data), table)
    finally:
        # also free the table if converting data fails
        free(table)
    return sum
def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
    """
    Roll the hash value sum forward by one byte and return the new hash value.

    remove is the byte leaving the window, add is the byte entering it, len is the
    window length. The buzhash table is generated from key.
    """
    cdef uint64_t *h = buzhash64_init_table(key)
    cdef uint64_t rolled = _buzhash64_update(sum, remove, add, len, h)
    free(h)
    return rolled

View file

@ -92,6 +92,7 @@ MAX_SEGMENT_DIR_INDEX = 2**32 - 1
# chunker algorithms
CH_BUZHASH = "buzhash"
CH_BUZHASH64 = "buzhash64"
CH_FIXED = "fixed"
CH_FAIL = "fail"
@ -103,6 +104,7 @@ HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically
# defaults, use --chunker-params to override
CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
# chunker params for the items metadata stream, finer granularity
ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)

View file

@ -187,6 +187,21 @@ def ChunkerParams(s):
return algo, block_size, header_size
if algo == "default" and count == 1: # default
return CHUNKER_PARAMS
if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size
chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:])
if not (chunk_min <= chunk_mask <= chunk_max):
raise argparse.ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
if chunk_min < 6:
# see comment in 'fixed' algo check
raise argparse.ArgumentTypeError(
"min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)"
)
if chunk_max > 23:
raise argparse.ArgumentTypeError(
"max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)"
)
# note that for buzhash64, there is no problem with even window_size.
return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size
# this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])

View file

@ -0,0 +1,77 @@
# Note: these tests are part of the self test, do not use or import pytest functionality here.
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
from io import BytesIO
from ...chunkers import get_chunker
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
from ...constants import * # NOQA
from .. import BaseTestCase
from . import cf
class ChunkerBuzHash64TestCase(BaseTestCase):
    def test_chunkify64(self):
        def chunks_of(key, min_exp, window_size, payload):
            # always chunk with a fresh chunker: max exponent CHUNK_MAX_EXP, 2 mask bits
            chunker = ChunkerBuzHash64(key, min_exp, CHUNK_MAX_EXP, 2, window_size)
            return cf(chunker.chunkify(BytesIO(payload)))

        # 1.5 times the maximum chunk size (plus one byte) must give exactly 2 chunks
        big = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
        big_parts = chunks_of(b"0", 1, 2, big)
        self.assert_equal(len(big_parts), 2)
        self.assert_equal(b"".join(big_parts), big)

        # empty input gives no chunks
        self.assert_equal(chunks_of(b"0", 1, 2, b""), [])

        # known cutting points for (key, min_exp, window_size)
        payload = b"foobarboobaz" * 3
        known_cuts = [
            (b"0", 1, 2, [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"]),
            (b"1", 1, 2, [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"]),
            (b"2", 1, 2, [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"]),
            (b"0", 2, 3, [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"]),
            (b"1", 2, 3, [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"]),
            (b"2", 2, 3, [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"]),
            (b"0", 3, 3, [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"]),
            (b"1", 3, 3, [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"]),
            (b"2", 3, 3, [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"]),
        ]
        for key, min_exp, window_size, expected in known_cuts:
            self.assert_equal(chunks_of(key, min_exp, window_size, payload), expected)

    def test_buzhash64(self):
        text = b"abcdefghijklmnop"
        self.assert_equal(buzhash64(text, b"0"), 13095190927899934478)
        self.assert_equal(buzhash64(text, b"1"), 10129419249308136910)

        # rolling the window by one byte must give the same value as hashing from scratch
        from_scratch = buzhash64(text, b"1")
        shifted = buzhash64(b"Xabcdefghijklmno", b"1")
        rolled = buzhash64_update(shifted, ord("X"), ord("p"), 16, b"1")
        self.assert_equal(rolled, from_scratch)

        # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)

    def test_small_reads64(self):
        class SmallReadFile:
            # every read() drops one byte from the end and hands out at most one byte
            input = b"a" * (20 + 1)

            def read(self, nbytes):
                self.input = self.input[:-1]
                return self.input[:1]

        chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False)
        pieces = cf(chunker.chunkify(SmallReadFile()))
        reconstructed = b"".join(pieces)
        assert reconstructed == b"a" * 20

View file

@ -0,0 +1,69 @@
from hashlib import sha256
from io import BytesIO
import os
from . import cf
from ...chunkers import ChunkerBuzHash64
from ...constants import * # NOQA
from ...helpers import hex_to_bin
def H(data):
    """Return the raw (binary) SHA-256 digest of data."""
    hasher = sha256()
    hasher.update(data)
    return hasher.digest()
def test_chunkpoints64_unchanged():
    """Guard against accidental changes of the chunk cutting points."""

    def twist(size):
        # deterministic pseudo-random bytes from a simple linear congruential generator
        state = 1
        out = bytearray(size)
        for pos in range(size):
            state = (state * 1103515245 + 12345) & 0x7FFFFFFF
            out[pos] = state & 0xFF
        return out

    data = twist(100000)

    window_sizes = (65, 129, HASH_WINDOW_SIZE, 7351)
    min_exps = (4, 6, 7, 11, 12)
    max_exps = (15, 17)
    mask_bits = (4, 7, 10, 12)
    keys = (b"first_key", b"second_key")

    # the nesting order matters: the overall hash depends on the order of the runs
    runs = []
    for winsize in window_sizes:
        for minexp in min_exps:
            for maxexp in max_exps:
                if minexp >= maxexp:
                    continue
                for maskbits in mask_bits:
                    for key in keys:
                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
                        chunk_hashes = [H(c) for c in cf(chunker.chunkify(BytesIO(data), -1))]
                        runs.append(H(b"".join(chunk_hashes)))

    # The "correct" hash below matches the existing chunker behavior.
    # Future chunker optimisations must not change this, or existing repos will bloat.
    overall_hash = H(b"".join(runs))
    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
def test_buzhash64_chunksize_distribution():
    """Chunk sizes for random data must respect min/max and mostly be hash-triggered."""
    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
    smallest, largest = 2**min_exp, 2**max_exp

    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
    chunks = cf(chunker.chunkify(BytesIO(os.urandom(1048576))))
    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp

    chunk_sizes = [len(chunk) for chunk in chunks]
    chunks_count = len(chunks)
    min_chunksize_observed = min(chunk_sizes)
    max_chunksize_observed = max(chunk_sizes)
    # how many chunks were cut because of size clipping rather than by the hash:
    min_count = chunk_sizes.count(smallest)
    max_count = chunk_sizes.count(largest)

    print(
        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
        f"min count: {min_count} max count: {max_count}"
    )
    # usually there will be about 64 chunks
    assert 32 < chunks_count < 128
    # chunks always must be between min and max (clipping must work):
    assert min_chunksize_observed >= smallest
    assert max_chunksize_observed <= largest
    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
    assert min_count < 10
    assert max_count < 10

View file

@ -69,6 +69,6 @@ class ChunkerTestCase(BaseTestCase):
self.input = self.input[:-1]
return self.input[:1]
chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False)
chunker = get_chunker(*CHUNKER_PARAMS, sparse=False)
reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile())))
assert reconstructed == b"a" * 20