Merge pull request #8903 from ThomasWaldmann/buzhash64

buzhash64 chunker
2026-06-11 01:41:57 -04:00 · 2025-06-11 08:31:27 +02:00 · 2025-06-11 08:31:27 +02:00 · 9a65d5245d
commit 9a65d5245d
parent 6487a9875c d23704e112
17 changed files with 519 additions and 16 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,6 +7,7 @@ src/borg/compress.c
 src/borg/crypto/low_level.c
 src/borg/item.c
 src/borg/chunkers/buzhash.c
+src/borg/chunkers/buzhash64.c
 src/borg/chunkers/reader.c
 src/borg/checksums.c
 src/borg/platform/darwin.c
--- a/docs/internals.rst
+++ b/docs/internals.rst
@ -19,8 +19,8 @@ specified when the backup was performed.
 Deduplication is performed globally across all data in the repository
 (multiple backups and even multiple hosts), both on data and file
 metadata, using :ref:`chunks` created by the chunker using the
-Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize
-algorithm ("fixed" chunker).
+Buzhash_ algorithm ("buzhash" and "buzhash64" chunkers) or a simpler
+fixed blocksize algorithm ("fixed" chunker).

 To perform the repository-wide deduplication, a hash of each
 chunk is checked against the :ref:`chunks cache <cache>`, which is a
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@ -399,6 +399,7 @@ Borg has these chunkers:
  supporting a header block of different size.
 - "buzhash": variable, content-defined blocksize, uses a rolling hash
  computed by the Buzhash_ algorithm.
+- "buzhash64": similar to "buzhash", but an improved 64-bit implementation.

 For some more general usage hints see also ``--chunker-params``.

@ -469,6 +470,16 @@ for the repository, and stored encrypted in the keyfile. This is to prevent
 chunk size based fingerprinting attacks on your encrypted repo contents (to
 guess what files you have based on a specific set of chunk sizes).

+"buzhash64" chunker
+++++++++++++++++++
+
+Similar to "buzhash", but using 64bit wide hash values.
+
+The buzhash table is cryptographically derived from secret key material.
+
+These changes should improve resistance against attacks and also solve
+some of the issues of the original (32bit / XORed table) implementation.
+
 .. _cache:

 The cache
--- a/docs/internals/security.rst
+++ b/docs/internals/security.rst
@ -361,13 +361,19 @@ The chunks stored in the repo are the (compressed, encrypted and authenticated)
 output of the chunker. The sizes of these stored chunks are influenced by the
 compression, encryption and authentication.

-buzhash chunker
-~~~~~~~~~~~~~~~
+buzhash and buzhash64 chunker
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The buzhash chunker chunks according to the input data, the chunker's
-parameters and the secret chunker seed (which all influence the chunk boundary
+The buzhash chunkers chunk according to the input data, the chunker's
+parameters and secret key material (which all influence the chunk boundary
 positions).

+Secret key material:
+
+- "buzhash": chunker seed (32bits), used for XORing the hardcoded buzhash table
+- "buzhash64": bh64_key (256bits) is derived from ID key, used to cryptographically
+  generate the table.
+
 Small files below some specific threshold (default: 512 KiB) result in only one
 chunk (identical content / size as the original file), bigger files result in
 multiple chunks.
--- a/scripts/make.py
+++ b/scripts/make.py
@ -543,6 +543,7 @@ cython_sources = """
 src/borg/compress.pyx
 src/borg/crypto/low_level.pyx
 src/borg/chunkers/buzhash.pyx
+src/borg/chunkers/buzhash64.pyx
 src/borg/chunkers/reader.pyx
 src/borg/hashindex.pyx
 src/borg/item.pyx
--- a/setup.py
+++ b/setup.py
@ -51,6 +51,7 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro
 compress_source = "src/borg/compress.pyx"
 crypto_ll_source = "src/borg/crypto/low_level.pyx"
 buzhash_source = "src/borg/chunkers/buzhash.pyx"
+buzhash64_source = "src/borg/chunkers/buzhash64.pyx"
 reader_source = "src/borg/chunkers/reader.pyx"
 hashindex_source = "src/borg/hashindex.pyx"
 item_source = "src/borg/item.pyx"
@ -66,6 +67,7 @@ cython_sources = [
    compress_source,
    crypto_ll_source,
    buzhash_source,
+    buzhash64_source,
    reader_source,
    hashindex_source,
    item_source,
@ -185,6 +187,7 @@ if not on_rtd:
        Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
        Extension("borg.item", [item_source], extra_compile_args=cflags),
        Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
+        Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
        Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
        Extension("borg.checksums", **checksums_ext_kwargs),
    ]
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@ -351,7 +351,7 @@ class ChunkBuffer:
        self.packer = msgpack.Packer()
        self.chunks = []
        self.key = key
-        self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, sparse=False)
+        self.chunker = get_chunker(*chunker_params, key=self.key, sparse=False)
        self.saved_chunks_len = None

    def add(self, item):
@ -1227,7 +1227,7 @@ class FilesystemObjectProcessors:
        self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None)))  # (dev, ino) -> chunks or None
        self.stats = Statistics(output_json=log_json, iec=iec)  # threading: done by cache (including progress)
        self.cwd = os.getcwd()
-        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
+        self.chunker = get_chunker(*chunker_params, key=key, sparse=sparse)

    @contextmanager
    def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None):
@ -1502,7 +1502,7 @@ class TarfileObjectProcessors:
        self.print_file_status = file_status_printer or (lambda *args: None)

        self.stats = Statistics(output_json=log_json, iec=iec)  # threading: done by cache (including progress)
-        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False)
+        self.chunker = get_chunker(*chunker_params, key=key, sparse=False)
        self.hlm = HardLinkManager(id_type=str, info_type=list)  # path -> chunks

    @contextmanager
@ -2325,7 +2325,7 @@ class ArchiveRecreater:
        target.process_file_chunks = ChunksProcessor(
            cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify
        ).process_file_chunks
-        target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False)
+        target.chunker = get_chunker(*target.chunker_params, key=self.key, sparse=False)
        return target

    def create_target_archive(self, name):
--- a/src/borg/archiver/benchmark_cmd.py
+++ b/src/borg/archiver/benchmark_cmd.py
@ -146,7 +146,8 @@ class BenchmarkMixIn:
                    pass

        for spec, func in [
-            ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)),
+            ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)),
+            ("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)),
            ("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)),
        ]:
            print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s")
--- a/src/borg/archiver/transfer_cmd.py
+++ b/src/borg/archiver/transfer_cmd.py
@ -41,7 +41,7 @@ def transfer_chunks(
        file = ChunkIteratorFileWrapper(chunk_iterator)

        # Create a chunker with the specified parameters
-        chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False)
+        chunker = get_chunker(*chunker_params, key=archive.key, sparse=False)
        for chunk in chunker.chunkify(file):
            if not dry_run:
                chunk_id, data = cached_hash(chunk, archive.key.id_hash)
--- a/src/borg/chunkers/init.py
+++ b/src/borg/chunkers/init.py
@ -1,4 +1,5 @@
 from .buzhash import Chunker
+from .buzhash64 import ChunkerBuzHash64
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
@ -7,12 +8,17 @@ API_VERSION = "1.2_01"


 def get_chunker(algo, *params, **kw):
+    key = kw.get("key", None)
+    sparse = kw.get("sparse", False)
+    # key.chunk_seed only has 32bits
+    seed = key.chunk_seed if key is not None else 0
+    # for buzhash64, we want a much longer key, so we derive it from the id key
+    bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
    if algo == "buzhash":
-        seed = kw["seed"]
-        sparse = kw["sparse"]
        return Chunker(seed, *params, sparse=sparse)
+    if algo == "buzhash64":
+        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
    if algo == "fixed":
-        sparse = kw["sparse"]
        return ChunkerFixed(*params, sparse=sparse)
    if algo == "fail":
        return ChunkerFailing(*params)
--- a/src/borg/chunkers/buzhash64.pyi
+++ b/src/borg/chunkers/buzhash64.pyi
@ -0,0 +1,20 @@
+from typing import List, Iterator, BinaryIO
+
+from .reader import fmap_entry
+
+API_VERSION: str
+
+# Compute the 64-bit buzhash of ``data``; the hash table is derived from ``key``.
+def buzhash64(data: bytes, key: bytes) -> int: ...
+# Roll the hash ``sum`` forward by one byte: ``remove`` leaves the window of ``len`` bytes,
+# ``add`` enters it; the hash table is derived from ``key``.
+def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
+
+# Type stub for the content-defined chunker implemented in buzhash64.pyx.
+class ChunkerBuzHash64:
+    # key: bytes the buzhash table is derived from.
+    # chunk_min_exp / chunk_max_exp: exponents, the min. / max. chunk size is 2**exp bytes.
+    # hash_mask_bits: count of low hash bits that must be 0 for a chunk to be cut.
+    # hash_window_size: size of the rolling hash window in bytes.
+    # sparse: passed on to the file reader.
+    def __init__(
+        self,
+        key: bytes,
+        chunk_min_exp: int,
+        chunk_max_exp: int,
+        hash_mask_bits: int,
+        hash_window_size: int,
+        sparse: bool = False,
+    ) -> None: ...
+    # Set up chunking of the given file and return an iterator over its chunks.
+    def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ...
--- a/src/borg/chunkers/buzhash64.pyx
+++ b/src/borg/chunkers/buzhash64.pyx
@ -0,0 +1,291 @@
+# cython: language_level=3
+
+API_VERSION = '1.2_01'
+
+import cython
+import time
+from hashlib import sha256
+
+from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport uint8_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy, memmove
+
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+from .reader import FileReader, Chunk
+
+# Cyclic polynomial / buzhash
+#
+# https://en.wikipedia.org/wiki/Rolling_hash
+#
+# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
+#
+# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
+#
+# Some properties of buzhash / of this implementation:
+#
+# (1) the hash is designed for inputs <= 64 bytes, but the chunker uses it on a 4095 byte window;
+#     any repeating bytes at distance 64 within those 4095 bytes can cause cancellation within
+#     the hash function, e.g. in "X <any 63 bytes> X", the last X would cancel out the influence
+#     of the first X on the hash value.
+
+# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
+# BARREL_SHIFT64(v, shift) rotates the 64-bit value v left by shift bits.
+# The right-shift count is masked with 0x3f, so that shift == 0 results in a right shift
+# by 0 bits (and not by 64 bits).
+cdef extern from *:
+   """
+   #define BARREL_SHIFT64(v, shift) (((v) << (shift)) | ((v) >> (((64 - (shift)) & 0x3f))))
+   """
+   uint64_t BARREL_SHIFT64(uint64_t v, uint64_t shift)
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+cdef uint64_t* buzhash64_init_table(bytes key) except NULL:
+    """
+    Allocate and initialize the buzhash table (256 entries) using the given key.
+
+    Table entry i is made from the first 8 bytes (interpreted as little-endian integer)
+    of sha256(<i as 2 hex digits> + key).
+
+    :param key: bytes the table entries are derived from
+    :return: malloc()ed table of 256 uint64_t values, the caller must free() it
+    :raises MemoryError: if the table can not be allocated
+    """
+    cdef int i
+    cdef uint64_t* table = <uint64_t*>malloc(256 * sizeof(uint64_t))
+    if table == NULL:
+        # without this check, a failed allocation would make us write to address 0 below.
+        raise MemoryError("can not allocate buzhash64 table")
+    try:
+        for i in range(256):
+            # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
+            v = f"{i:02x}".encode() + key
+            d64 = sha256(v).digest()[:8]
+            table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
+    except BaseException:
+        # do not leak the table if computing an entry fails (e.g. if key is None).
+        free(table)
+        raise
+    return table
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint64_t _buzhash64(const unsigned char* data, size_t len, const uint64_t* h):
+    """
+    Calculate the buzhash of the given data.
+
+    The table entry of each byte is rotated left by its distance to the last byte
+    (modulo 64) and all of these values are XORed; the last byte is not rotated.
+
+    :param data: pointer to the bytes to hash
+    :param len: count of bytes to hash
+    :param h: buzhash table (256 entries)
+    :return: the hash value, 0 for len == 0
+    """
+    cdef uint64_t i
+    cdef uint64_t sum = 0, imod
+    if len == 0:
+        # len is unsigned, so "len - 1" below would wrap around to a huge value and
+        # the loop would read far beyond the end of data.
+        return 0
+    for i in range(len - 1, 0, -1):
+        imod = i & 0x3f
+        sum ^= BARREL_SHIFT64(h[data[0]], imod)
+        data += 1
+    return sum ^ h[data[0]]
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)  # Deactivate negative indexing.
+@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
+cdef uint64_t _buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, const uint64_t* h):
+    """
+    Roll the buzhash forward by one byte.
+
+    :param sum: hash value of the current window
+    :param remove: byte that leaves the window
+    :param add: byte that enters the window
+    :param len: window size in bytes
+    :param h: buzhash table (256 entries)
+    :return: hash value of the window moved forward by one byte
+    """
+    # rotating the old hash by 1 bit moves the contribution of every byte one position ahead
+    cdef uint64_t rotated = BARREL_SHIFT64(sum, 1)
+    # the byte leaving the window now has a rotation of len (modulo 64), XOR cancels it out
+    cdef uint64_t outgoing = BARREL_SHIFT64(h[remove], len & 0x3f)
+    return rotated ^ outgoing ^ h[add]
+
+
+cdef class ChunkerBuzHash64:
+    """
+    Content-Defined Chunker, variable chunk sizes.
+
+    This chunker makes quite some effort to cut mostly chunks of the same-content, even if
+    the content moves to a different offset inside the file. It uses the buzhash
+    rolling-hash algorithm to identify the chunk cutting places by looking at the
+    content inside the moving window and computing the rolling hash value over the
+    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
+    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
+    The buzhash table is derived from the given key, so the chunk cutting places depend
+    on that key (this is to avoid some chunk length fingerprinting attacks).
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t* table
+    cdef uint8_t* data
+    cdef object _fd  # Python object for file descriptor
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, window_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
+    cdef readonly float chunking_time
+    cdef object file_reader  # FileReader instance
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+        """
+        Set up the chunker.
+
+        :param key: bytes the buzhash table is derived from
+        :param chunk_min_exp: min. chunk size is 2**chunk_min_exp bytes
+        :param chunk_max_exp: max. chunk size (and buffer size) is 2**chunk_max_exp bytes
+        :param hash_mask_bits: count of low hash bits that must be 0 to cut a chunk
+        :param hash_window_size: size of the rolling hash window in bytes
+        :param sparse: passed on to the FileReader
+        :raises MemoryError: if the table or the buffer can not be allocated
+        """
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        # see process(), first while loop condition, first term must be able to get True:
+        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
+
+        self.window_size = hash_window_size
+        self.chunk_mask = (1 << hash_mask_bits) - 1
+        self.min_size = min_size
+        self.table = buzhash64_init_table(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        if self.data == NULL:
+            # fail early instead of copying into a NULL buffer later, in fill().
+            raise MemoryError("can not allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        """Free the chunker's resources."""
+        if self.table != NULL:
+            free(self.table)
+            self.table = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """
+        Fill the chunker's buffer with more data.
+
+        Moves the not yet yielded data to the start of the buffer, then reads
+        as much as fits into the free space behind it. Sets self.eof if the
+        reader did not deliver any data. Always returns 1 (0 is reserved for
+        signalling an exception).
+        """
+        cdef ssize_t n
+        cdef object chunk
+
+        # Move remaining data to the beginning of the buffer
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        # Use FileReader to read data
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            # Only copy data if it's not a hole
+            if chunk.meta["allocation"] == CH_DATA:
+                # Copy data from chunk to our buffer
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
+            else:
+                # For holes, fill with zeros
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)
+
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+
+        return 1
+
+    cdef object process(self) except *:
+        """
+        Process the chunker's buffer and return the next chunk.
+
+        :return: a memoryview of the chunk's bytes
+        :raises StopIteration: if all data that was read has been yielded
+        :raises Exception: if the count of bytes read and bytes yielded does not match
+        """
+        cdef uint64_t sum, chunk_mask = self.chunk_mask
+        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
+        cdef uint8_t* p
+        cdef uint8_t* stop_at
+        cdef size_t did_bytes
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in __cinit__
+            if not self.fill():
+                return None
+
+        # Here we either are at eof...
+        if self.eof:
+            self.done = 1
+            if self.remaining:
+                self.bytes_yielded += self.remaining
+                # Return a memory view of the remaining data
+                return memoryview((self.data + self.position)[:self.remaining])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # ... or we have at least min_size + window_size + 1 bytes remaining.
+        # We do not want to "cut" a chunk smaller than min_size and the hash
+        # window starts at the potential cutting place.
+        self.position += min_size
+        self.remaining -= min_size
+        sum = _buzhash64(self.data + self.position, window_size, self.table)
+
+        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
+            p = self.data + self.position
+            stop_at = p + self.remaining - window_size
+
+            # roll the hash window forward, byte by byte, until the masked hash bits
+            # are all 0 or there is not enough buffered data left for a full window.
+            while p < stop_at and (sum & chunk_mask):
+                sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
+                p += 1
+
+            did_bytes = p - (self.data + self.position)
+            self.position += did_bytes
+            self.remaining -= did_bytes
+
+            if self.remaining <= window_size:
+                if not self.fill():
+                    return None
+
+        # not more than a window's worth of data is left: it becomes a part of this chunk.
+        if self.remaining <= window_size:
+            self.position += self.remaining
+            self.remaining = 0
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+
+        # Return a memory view of the chunk
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
+        :return: self, iterating over it yields the chunks
+        """
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Return the next Chunk and add the time needed for it to chunking_time."""
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        # process() only gives us the chunk's bytes and no allocation information,
+        # but we can just check if data was all-zero (and either came from a hole
+        # or from stored zeros - we can not detect that here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def buzhash64(data, bytes key):
+    """
+    Calculate the buzhash64 of data, using a table derived from key.
+
+    :param data: the bytes to hash
+    :param key: bytes the buzhash table is derived from
+    :return: the 64-bit hash value as an int
+    """
+    cdef uint64_t *table
+    cdef uint64_t sum
+    table = buzhash64_init_table(key)
+    try:
+        # converting data to a C pointer and len(data) can raise for unsuitable objects,
+        # the temporary table must not be leaked in that case.
+        sum = _buzhash64(<const unsigned char *> data, len(data), table)
+    finally:
+        free(table)
+    return sum
+
+
+def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
+    """
+    Roll the buzhash64 value sum forward by one byte, using a table derived from key.
+
+    :param sum: hash value of the current window
+    :param remove: byte value that leaves the window
+    :param add: byte value that enters the window
+    :param len: window size in bytes
+    :param key: bytes the buzhash table is derived from
+    :return: the updated 64-bit hash value as an int
+    """
+    cdef uint64_t *table = buzhash64_init_table(key)
+    cdef uint64_t updated = _buzhash64_update(sum, remove, add, len, table)
+    free(table)
+    return updated
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@ -92,6 +92,7 @@ MAX_SEGMENT_DIR_INDEX = 2**32 - 1

 # chunker algorithms
 CH_BUZHASH = "buzhash"
+CH_BUZHASH64 = "buzhash64"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"

@ -103,6 +104,7 @@ HASH_MASK_BITS = 21  # results in ~2MiB chunks statistically

 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@ -187,6 +187,21 @@ def ChunkerParams(s):
        return algo, block_size, header_size
    if algo == "default" and count == 1:  # default
        return CHUNKER_PARAMS
+    if algo == CH_BUZHASH64 and count == 5:  # buzhash64, chunk_min, chunk_max, chunk_mask, window_size
+        chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:])
+        if not (chunk_min <= chunk_mask <= chunk_max):
+            raise argparse.ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
+        if chunk_min < 6:
+            # see comment in 'fixed' algo check
+            raise argparse.ArgumentTypeError(
+                "min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)"
+            )
+        if chunk_max > 23:
+            raise argparse.ArgumentTypeError(
+                "max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)"
+            )
+        # note that for buzhash64, there is no problem with even window_size.
+        return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size
    # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
    if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
        chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
--- a/src/borg/testsuite/chunkers/buzhash64_self_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py
@ -0,0 +1,77 @@
+# Note: these tests are part of the self test, do not use or import pytest functionality here.
+#       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
+
+from io import BytesIO
+
+from ...chunkers import get_chunker
+from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
+from ...constants import *  # NOQA
+from .. import BaseTestCase
+from . import cf
+
+
+# Self tests for the buzhash64 chunker and the buzhash64 hash functions.
+class ChunkerBuzHash64TestCase(BaseTestCase):
+    # Chunk small inputs with different keys / min. chunk sizes / window sizes and
+    # compare the resulting chunks against known results.
+    def test_chunkify64(self):
+        data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
+        parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
+        self.assert_equal(len(parts), 2)
+        self.assert_equal(b"".join(parts), data)
+        # empty input gives no chunks:
+        self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), [])
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"],
+        )
+        self.assert_equal(
+            cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))),
+            [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"],
+        )
+
+    # Check known hash values and that rolling the hash forward by one byte gives
+    # the same value as hashing the moved window from scratch.
+    def test_buzhash64(self):
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478)
+        self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910)
+        expected = buzhash64(b"abcdefghijklmnop", b"1")
+        previous = buzhash64(b"Xabcdefghijklmno", b"1")
+        this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1")
+        self.assert_equal(this, expected)
+        # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
+        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)
+
+    # Check that the chunker copes with a file object that returns less data than requested.
+    def test_small_reads64(self):
+        class SmallReadFile:
+            input = b"a" * (20 + 1)
+
+            # ignores nbytes: drops the last byte of the remaining input, then returns
+            # (at most) its first byte - so 20 reads give b"a", then b"" is returned.
+            def read(self, nbytes):
+                self.input = self.input[:-1]
+                return self.input[:1]
+
+        chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False)
+        reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile())))
+        assert reconstructed == b"a" * 20
--- a/src/borg/testsuite/chunkers/buzhash64_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_test.py
@ -0,0 +1,69 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+
+from . import cf
+from ...chunkers import ChunkerBuzHash64
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+def H(data):
+    return sha256(data).digest()
+
+
+def test_chunkpoints64_unchanged():
+    """Check that the chunk cutting places for many parameter combinations did not change."""
+
+    def twist(size):
+        # deterministic pseudo-random test data (linear congruential generator, fixed start value),
+        # using the low 8 bits of each generated value.
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    # one hash per parameter combination, computed over the hashes of all chunks of that run.
+    # note: the iteration order influences the overall hash.
+    runs = []
+    for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    for key in (b"first_key", b"second_key"):
+                        fh = BytesIO(data)
+                        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimisations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
+
+
+def test_buzhash64_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
--- a/src/borg/testsuite/chunkers/buzhash_self_test.py
+++ b/src/borg/testsuite/chunkers/buzhash_self_test.py
@ -69,6 +69,6 @@ class ChunkerTestCase(BaseTestCase):
                self.input = self.input[:-1]
                return self.input[:1]

-        chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False)
+        chunker = get_chunker(*CHUNKER_PARAMS, sparse=False)
        reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile())))
        assert reconstructed == b"a" * 20