diff --git a/.gitignore b/.gitignore index 028febb19..13717d20a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ src/borg/compress.c src/borg/crypto/low_level.c src/borg/item.c src/borg/chunkers/buzhash.c +src/borg/chunkers/buzhash64.c src/borg/chunkers/reader.c src/borg/checksums.c src/borg/platform/darwin.c diff --git a/docs/internals.rst b/docs/internals.rst index e587803cb..3c6645c19 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -19,8 +19,8 @@ specified when the backup was performed. Deduplication is performed globally across all data in the repository (multiple backups and even multiple hosts), both on data and file metadata, using :ref:`chunks` created by the chunker using the -Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize -algorithm ("fixed" chunker). +Buzhash_ algorithm ("buzhash" and "buzhash64" chunkers) or a simpler +fixed blocksize algorithm ("fixed" chunker). To perform the repository-wide deduplication, a hash of each chunk is checked against the :ref:`chunks cache <cache>`, which is a diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index ff1136a60..b7ffccc36 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -399,6 +399,7 @@ Borg has these chunkers: supporting a header block of different size. - "buzhash": variable, content-defined blocksize, uses a rolling hash computed by the Buzhash_ algorithm. +- "buzhash64": similar to "buzhash", but an improved 64-bit implementation. For some more general usage hints see also ``--chunker-params``. @@ -469,6 +470,16 @@ for the repository, and stored encrypted in the keyfile. This is to prevent chunk size based fingerprinting attacks on your encrypted repo contents (to guess what files you have based on a specific set of chunk sizes). +"buzhash64" chunker ++++++++++++++++++++ + +Similar to "buzhash", but using 64-bit wide hash values. 
+ +The buzhash table is cryptographically derived from secret key material. + +These changes should improve resistance against attacks and also solve +some of the issues of the original (32-bit / XORed table) implementation. + .. _cache: The cache diff --git a/docs/internals/security.rst b/docs/internals/security.rst index 40b27d797..bcddbb2e8 100644 --- a/docs/internals/security.rst +++ b/docs/internals/security.rst @@ -361,13 +361,19 @@ The chunks stored in the repo are the (compressed, encrypted and authenticated) output of the chunker. The sizes of these stored chunks are influenced by the compression, encryption and authentication. -buzhash chunker -~~~~~~~~~~~~~~~ +buzhash and buzhash64 chunker +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The buzhash chunker chunks according to the input data, the chunker's -parameters and the secret chunker seed (which all influence the chunk boundary +The buzhash chunkers chunk according to the input data, the chunker's +parameters and secret key material (which all influence the chunk boundary positions). +Secret key material: + +- "buzhash": chunker seed (32 bits), used for XORing the hardcoded buzhash table +- "buzhash64": bh64_key (256 bits), derived from the ID key, used to cryptographically + generate the table. + Small files below some specific threshold (default: 512 KiB) result in only one chunk (identical content / size as the original file), bigger files result in multiple chunks. 
diff --git a/scripts/make.py b/scripts/make.py index 05b4072de..0a64493ca 100644 --- a/scripts/make.py +++ b/scripts/make.py @@ -543,6 +543,7 @@ cython_sources = """ src/borg/compress.pyx src/borg/crypto/low_level.pyx src/borg/chunkers/buzhash.pyx +src/borg/chunkers/buzhash64.pyx src/borg/chunkers/reader.pyx src/borg/hashindex.pyx src/borg/item.pyx diff --git a/setup.py b/setup.py index 19f403583..859d34690 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro compress_source = "src/borg/compress.pyx" crypto_ll_source = "src/borg/crypto/low_level.pyx" buzhash_source = "src/borg/chunkers/buzhash.pyx" +buzhash64_source = "src/borg/chunkers/buzhash64.pyx" reader_source = "src/borg/chunkers/reader.pyx" hashindex_source = "src/borg/hashindex.pyx" item_source = "src/borg/item.pyx" @@ -66,6 +67,7 @@ cython_sources = [ compress_source, crypto_ll_source, buzhash_source, + buzhash64_source, reader_source, hashindex_source, item_source, @@ -185,6 +187,7 @@ if not on_rtd: Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags), Extension("borg.item", [item_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), + Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), Extension("borg.checksums", **checksums_ext_kwargs), ] diff --git a/src/borg/archive.py b/src/borg/archive.py index 462da3136..5bf8faaec 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -351,7 +351,7 @@ class ChunkBuffer: self.packer = msgpack.Packer() self.chunks = [] self.key = key - self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, sparse=False) + self.chunker = get_chunker(*chunker_params, key=self.key, sparse=False) 
self.saved_chunks_len = None def add(self, item): @@ -1227,7 +1227,7 @@ class FilesystemObjectProcessors: self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.cwd = os.getcwd() - self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) + self.chunker = get_chunker(*chunker_params, key=key, sparse=sparse) @contextmanager def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None): @@ -1502,7 +1502,7 @@ class TarfileObjectProcessors: self.print_file_status = file_status_printer or (lambda *args: None) self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) - self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False) + self.chunker = get_chunker(*chunker_params, key=key, sparse=False) self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks @contextmanager @@ -2325,7 +2325,7 @@ class ArchiveRecreater: target.process_file_chunks = ChunksProcessor( cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify ).process_file_chunks - target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False) + target.chunker = get_chunker(*target.chunker_params, key=self.key, sparse=False) return target def create_target_archive(self, name): diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index e69aa2c18..2818435f1 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -146,7 +146,8 @@ class BenchmarkMixIn: pass for spec, func in [ - ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)), + ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)), + ("buzhash64,19,23,21,4095", lambda: 
chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)), ("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)), ]: print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s") diff --git a/src/borg/archiver/transfer_cmd.py b/src/borg/archiver/transfer_cmd.py index 617c8abec..4ada0b848 100644 --- a/src/borg/archiver/transfer_cmd.py +++ b/src/borg/archiver/transfer_cmd.py @@ -41,7 +41,7 @@ def transfer_chunks( file = ChunkIteratorFileWrapper(chunk_iterator) # Create a chunker with the specified parameters - chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False) + chunker = get_chunker(*chunker_params, key=archive.key, sparse=False) for chunk in chunker.chunkify(file): if not dry_run: chunk_id, data = cached_hash(chunk, archive.key.id_hash)
diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py
index 7f7833b8c..463be44c8 100644
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -1,4 +1,5 @@
 from .buzhash import Chunker
+from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
@@ -7,12 +8,25 @@ API_VERSION = "1.2_01"
 
 
 def get_chunker(algo, *params, **kw):
+    """
+    Create the chunker implementing *algo*, passing *params* through to its constructor.
+
+    Keyword arguments: ``key`` (key object giving the chunker secret, optional), ``sparse`` (default False).
+    """
+    key = kw.get("key", None)
+    sparse = kw.get("sparse", False)
     if algo == "buzhash":
-        seed = kw["seed"]
-        sparse = kw["sparse"]
+        # key.chunk_seed only has 32bits
+        seed = key.chunk_seed if key is not None else 0
         return Chunker(seed, *params, sparse=sparse)
+    if algo == "buzhash64":
+        # for buzhash64, we want a much longer key, so we derive it from the id key.
+        # derive it only here, so that creating any other chunker does not pay for the key derivation.
+        bh64_key = b""
+        if key is not None:
+            bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True)
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
     if algo == "fixed":
-        sparse = kw["sparse"]
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":
         return ChunkerFailing(*params)
diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi
new file mode
100644
index 000000000..3414bd609
--- /dev/null
+++ b/src/borg/chunkers/buzhash64.pyi
@@ -0,0 +1,21 @@
+from typing import List, Iterator, BinaryIO, Optional
+
+from .reader import fmap_entry
+
+API_VERSION: str
+
+def buzhash64(data: bytes, key: bytes) -> int: ...
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
+
+class ChunkerBuzHash64:
+    chunking_time: float
+    def __init__(
+        self,
+        key: bytes,
+        chunk_min_exp: int,
+        chunk_max_exp: int,
+        hash_mask_bits: int,
+        hash_window_size: int,
+        sparse: bool = False,
+    ) -> None: ...
+    def chunkify(self, fd: Optional[BinaryIO], fh: int = -1, fmap: Optional[List[fmap_entry]] = None) -> Iterator: ...
diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx
new file mode 100644
index 000000000..0199406fe
--- /dev/null
+++ b/src/borg/chunkers/buzhash64.pyx
@@ -0,0 +1,310 @@
+# cython: language_level=3
+
+API_VERSION = '1.2_01'
+
+import cython
+import time
+from hashlib import sha256
+
+from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport uint8_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy, memmove
+
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+from .reader import FileReader, Chunk
+
+# Cyclic polynomial / buzhash
+#
+# https://en.wikipedia.org/wiki/Rolling_hash
+#
+# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
+#
+# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
+#
+# Some properties of buzhash / of this implementation:
+#
+# (1) the hash is designed for inputs <= 64 bytes, but the chunker uses it on a 4095 byte window;
+#     any repeating bytes at distance 64 within those 4095 bytes can cause cancellation within
+#     the hash function, e.g. in "X <any 63 bytes> X", the last X would cancel out the influence
+#     of the first X on the hash value.
+
+# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
+cdef extern from *:
+    """
+    #define BARREL_SHIFT64(v, shift) (((v) << (shift)) | ((v) >> (((64 - (shift)) & 0x3f))))
+    """
+    uint64_t BARREL_SHIFT64(uint64_t v, uint64_t shift)
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+cdef uint64_t* buzhash64_init_table(bytes key) except NULL:
+    """
+    Initialize the buzhash table using the given key.
+
+    Returns a malloc()ed table of 256 uint64_t entries that the caller must free().
+    Raises MemoryError if the table cannot be allocated.
+    """
+    cdef int i
+    cdef uint64_t* table = <uint64_t*>malloc(256 * sizeof(uint64_t))
+    if table == NULL:
+        raise MemoryError("could not allocate buzhash64 table")
+    try:
+        for i in range(256):
+            # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+            v = f"{i:02x}".encode() + key
+            d64 = sha256(v).digest()[:8]
+            table[i] = int.from_bytes(d64, byteorder='little')
+    except BaseException:
+        # do not leak the table if computing an entry fails (e.g. key is None)
+        free(table)
+        raise
+    return table
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint64_t _buzhash64(const unsigned char* data, size_t len, const uint64_t* h):
+    """Calculate the buzhash of the given data."""
+    cdef uint64_t i
+    cdef uint64_t sum = 0, imod
+    # an empty input hashes to 0; without this guard, len - 1 would wrap around (size_t) below.
+    if len == 0:
+        return 0
+    for i in range(len - 1, 0, -1):
+        imod = i & 0x3f
+        sum ^= BARREL_SHIFT64(h[data[0]], imod)
+        data += 1
+    return sum ^ h[data[0]]
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint64_t _buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, const uint64_t* h):
+    """Update the buzhash with a new byte."""
+    cdef uint64_t lenmod = len & 0x3f
+    return BARREL_SHIFT64(sum, 1) ^ BARREL_SHIFT64(h[remove], lenmod) ^ h[add]
+
+
+cdef class ChunkerBuzHash64:
+    """
+    Content-Defined Chunker, variable chunk sizes.
+
+    This chunker makes quite some effort to cut mostly chunks of the same-content, even if
+    the content moves to a different offset inside the file. It uses the buzhash
+    rolling-hash algorithm to identify the chunk cutting places by looking at the
+    content inside the moving window and computing the rolling hash value over the
+    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
+    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
+    Its buzhash table is derived from a secret key to avoid some chunk length fingerprinting attacks.
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t* table
+    cdef uint8_t* data
+    cdef object _fd  # Python object for file descriptor
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, window_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
+    cdef readonly double chunking_time  # accumulated seconds; a C float would lose precision while summing
+    cdef object file_reader  # FileReader instance
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        # see chunker_process, first while loop condition, first term must be able to get True:
+        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
+
+        self.window_size = hash_window_size
+        self.chunk_mask = (<uint64_t>1 << hash_mask_bits) - 1
+        self.min_size = min_size
+        self.table = buzhash64_init_table(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        if self.data == NULL:
+            raise MemoryError("could not allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        """Free the chunker's resources."""
+        if self.table != NULL:
+            free(self.table)
+            self.table = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Fill the chunker's buffer with more data."""
+        cdef ssize_t n
+        cdef object chunk
+
+        # Move remaining data to the beginning of the buffer
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        # Use FileReader to read data
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            # Only copy data if it's not a hole
+            if chunk.meta["allocation"] == CH_DATA:
+                # Copy data from chunk to our buffer
+                memcpy(self.data + self.position + self.remaining, PyBytes_AsString(chunk.data), n)
+            else:
+                # For holes, fill with zeros
+                memcpy(self.data + self.position + self.remaining, PyBytes_AsString(zeros[:n]), n)
+
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+
+        return 1
+
+    cdef object process(self) except *:
+        """Process the chunker's buffer and return the next chunk."""
+        cdef uint64_t sum, chunk_mask = self.chunk_mask
+        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
+        cdef uint8_t* p
+        cdef uint8_t* stop_at
+        cdef size_t did_bytes
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in Chunker init
+            if not self.fill():
+                return None
+
+        # Here we either are at eof...
+        if self.eof:
+            self.done = 1
+            if self.remaining:
+                self.bytes_yielded += self.remaining
+                # Return a memory view of the remaining data
+                return memoryview((self.data + self.position)[:self.remaining])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # ... or we have at least min_size + window_size + 1 bytes remaining.
+        # We do not want to "cut" a chunk smaller than min_size and the hash
+        # window starts at the potential cutting place.
+        self.position += min_size
+        self.remaining -= min_size
+        sum = _buzhash64(self.data + self.position, window_size, self.table)
+
+        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
+            p = self.data + self.position
+            stop_at = p + self.remaining - window_size
+
+            while p < stop_at and (sum & chunk_mask):
+                sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
+                p += 1
+
+            did_bytes = p - (self.data + self.position)
+            self.position += did_bytes
+            self.remaining -= did_bytes
+
+            if self.remaining <= window_size:
+                if not self.fill():
+                    return None
+
+        if self.remaining <= window_size:
+            self.position += self.remaining
+            self.remaining = 0
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+
+        # Return a memory view of the chunk
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        """
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
+        # but we can just check if data was all-zero (and either came from a hole
+        # or from stored zeros - we can not detect that here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def buzhash64(data, bytes key):
+    """Return the buzhash64 of data, using a table freshly derived from key (empty data hashes to 0)."""
+    cdef uint64_t *table
+    cdef uint64_t sum
+    table = buzhash64_init_table(key)
+    sum = _buzhash64(<const unsigned char *> data, len(data), table)
+    free(table)
+    return sum
+
+
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
+    """Roll the hash sum over a window of len bytes by one byte: drop byte remove, append byte add."""
+    cdef uint64_t *table
+    table = buzhash64_init_table(key)
+    sum = _buzhash64_update(sum, remove, add, len, table)
+    free(table)
+    return sum
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 911a8f1be..8bad4b6b4 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -92,6 +92,7 @@ MAX_SEGMENT_DIR_INDEX = 2**32 - 1
 
 # chunker algorithms
 CH_BUZHASH = "buzhash"
+CH_BUZHASH64 = "buzhash64"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"
 
@@ -103,6 +104,7 @@ HASH_MASK_BITS = 21  # results in ~2MiB chunks statistically
 
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 
 # chunker
params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index cb481685a..c98d16f79 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -187,6 +187,21 @@ def ChunkerParams(s): return algo, block_size, header_size if algo == "default" and count == 1: # default return CHUNKER_PARAMS + if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size + chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:]) + if not (chunk_min <= chunk_mask <= chunk_max): + raise argparse.ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max") + if chunk_min < 6: + # see comment in 'fixed' algo check + raise argparse.ArgumentTypeError( + "min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)" + ) + if chunk_max > 23: + raise argparse.ArgumentTypeError( + "max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)" + ) + # note that for buzhash64, there is no problem with even window_size. + return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :]) diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py new file mode 100644 index 000000000..41198477d --- /dev/null +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -0,0 +1,77 @@ +# Note: these tests are part of the self test, do not use or import pytest functionality here. +# See borg.selftest for details. 
If you add/remove test methods, update SELFTEST_COUNT + +from io import BytesIO + +from ...chunkers import get_chunker +from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64 +from ...constants import * # NOQA +from .. import BaseTestCase +from . import cf + + +class ChunkerBuzHash64TestCase(BaseTestCase): + def test_chunkify64(self): + data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y" + parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) + self.assert_equal(len(parts), 2) + self.assert_equal(b"".join(parts), data) + self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) + self.assert_equal( + cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"], + ) + self.assert_equal( + 
cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"], + ) + self.assert_equal( + cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + ) + + def test_buzhash64(self): + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478) + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910) + expected = buzhash64(b"abcdefghijklmnop", b"1") + previous = buzhash64(b"Xabcdefghijklmno", b"1") + this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1") + self.assert_equal(this, expected) + # Test with more than 63 bytes to make sure our barrel_shift macro works correctly + self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899) + + def test_small_reads64(self): + class SmallReadFile: + input = b"a" * (20 + 1) + + def read(self, nbytes): + self.input = self.input[:-1] + return self.input[:1] + + chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False) + reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) + assert reconstructed == b"a" * 20 diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py new file mode 100644 index 000000000..7a0019732 --- /dev/null +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -0,0 +1,69 @@ +from hashlib import sha256 +from io import BytesIO +import os + +from . 
import cf +from ...chunkers import ChunkerBuzHash64 +from ...constants import * # NOQA +from ...helpers import hex_to_bin + + +def H(data): + return sha256(data).digest() + + +def test_chunkpoints64_unchanged(): + def twist(size): + x = 1 + a = bytearray(size) + for i in range(size): + x = (x * 1103515245 + 12345) & 0x7FFFFFFF + a[i] = x & 0xFF + return a + + data = twist(100000) + + runs = [] + for winsize in (65, 129, HASH_WINDOW_SIZE, 7351): + for minexp in (4, 6, 7, 11, 12): + for maxexp in (15, 17): + if minexp >= maxexp: + continue + for maskbits in (4, 7, 10, 12): + for key in (b"first_key", b"second_key"): + fh = BytesIO(data) + chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize) + chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] + runs.append(H(b"".join(chunks))) + + # The "correct" hash below matches the existing chunker behavior. + # Future chunker optimisations must not change this, or existing repos will bloat. + overall_hash = H(b"".join(runs)) + assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc") + + +def test_buzhash64_chunksize_distribution(): + data = os.urandom(1048576) + min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB + chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095) + f = BytesIO(data) + chunks = cf(chunker.chunkify(f)) + del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp + chunk_sizes = [len(chunk) for chunk in chunks] + chunks_count = len(chunks) + min_chunksize_observed = min(chunk_sizes) + max_chunksize_observed = max(chunk_sizes) + min_count = sum(int(size == 2**min_exp) for size in chunk_sizes) + max_count = sum(int(size == 2**max_exp) for size in chunk_sizes) + print( + f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} " + f"min count: {min_count} max count: {max_count}" + ) + # usually there will about 64 chunks + assert 32 < chunks_count < 128 + # chunks 
always must be between min and max (clipping must work): + assert min_chunksize_observed >= 2**min_exp + assert max_chunksize_observed <= 2**max_exp + # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size: + assert min_count < 10 + assert max_count < 10 diff --git a/src/borg/testsuite/chunkers/buzhash_self_test.py b/src/borg/testsuite/chunkers/buzhash_self_test.py index 1c6337047..9baf862f3 100644 --- a/src/borg/testsuite/chunkers/buzhash_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash_self_test.py @@ -69,6 +69,6 @@ class ChunkerTestCase(BaseTestCase): self.input = self.input[:-1] return self.input[:1] - chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False) + chunker = get_chunker(*CHUNKER_PARAMS, sparse=False) reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b"a" * 20