From cbe6ba719d01b9f49762f4b8a39af606a127fbe2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 Jun 2025 19:15:11 +0200 Subject: [PATCH 1/8] chunkers: prepare for buzhash64 Added some *64*.* files that are just 1:1 copies of their 32bit counterparts, so that the changes for the 64bit adaption will later be better visible. --- src/borg/chunkers/buzhash64.pyi | 20 ++ src/borg/chunkers/buzhash64.pyx | 332 ++++++++++++++++++ .../testsuite/chunkers/buzhash64_self_test.py | 74 ++++ src/borg/testsuite/chunkers/buzhash64_test.py | 69 ++++ 4 files changed, 495 insertions(+) create mode 100644 src/borg/chunkers/buzhash64.pyi create mode 100644 src/borg/chunkers/buzhash64.pyx create mode 100644 src/borg/testsuite/chunkers/buzhash64_self_test.py create mode 100644 src/borg/testsuite/chunkers/buzhash64_test.py diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi new file mode 100644 index 000000000..16f9d46bb --- /dev/null +++ b/src/borg/chunkers/buzhash64.pyi @@ -0,0 +1,20 @@ +from typing import List, Iterator, BinaryIO + +from .reader import fmap_entry + +API_VERSION: str + +def buzhash(data: bytes, seed: int) -> int: ... +def buzhash_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ... + +class Chunker: + def __init__( + self, + seed: int, + chunk_min_exp: int, + chunk_max_exp: int, + hash_mask_bits: int, + hash_window_size: int, + sparse: bool = False, + ) -> None: ... + def chunkify(self, fd: BinaryIO = None, fh: int = -1, fmap: List[fmap_entry] = None) -> Iterator: ... 
diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx new file mode 100644 index 000000000..974a02707 --- /dev/null +++ b/src/borg/chunkers/buzhash64.pyx @@ -0,0 +1,332 @@ +# cython: language_level=3 + +API_VERSION = '1.2_01' + +import cython +import time +from cpython.bytes cimport PyBytes_AsString +from libc.stdint cimport uint8_t, uint32_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memmove + +from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros +from .reader import FileReader, Chunk + +# Cyclic polynomial / buzhash +# +# https://en.wikipedia.org/wiki/Rolling_hash +# +# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor) +# +# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide) +# +# Some properties of buzhash / of this implementation: +# +# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window; +# any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within +# the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence +# of the first X on the hash value. +# +# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of +# 0/1 bit values per position, but the hard coded table below doesn't fit that property. +# +# (3) if you would use a window size divisible by 64, the seed would cancel itself out completely. +# this is why we use a window size of 4095 bytes. +# +# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant +# is equivalent to XORing the hash output with a different constant. but since the seed is stored +# encrypted, i think it still serves its purpose. 
+ +cdef uint32_t table_base[256] +table_base = [ + 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, + 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, + 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, + 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, + 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, + 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, + 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, + 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, + 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, + 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, + 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, + 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, + 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, + 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, + 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, + 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, + 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, + 0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, + 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, + 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 
0x89fffe35, + 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, + 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, + 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, + 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, + 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, + 0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4, + 0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512, + 0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009, + 0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2, + 0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c, + 0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9, + 0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b +] + +# This seems to be the most reliable way to inline this code, using a C preprocessor macro: +cdef extern from *: + """ + #define BARREL_SHIFT(v, shift) (((v) << (shift)) | ((v) >> (((32 - (shift)) & 0x1f)))) + """ + uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift) + + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +cdef uint32_t* buzhash_init_table(uint32_t seed): + """Initialize the buzhash table with the given seed.""" + cdef int i + cdef uint32_t* table = malloc(1024) # 256 * sizeof(uint32_t) + for i in range(256): + table[i] = table_base[i] ^ seed + return table + + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. 
+@cython.cdivision(True) # Use C division/modulo semantics for integer division. +cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h): + """Calculate the buzhash of the given data.""" + cdef uint32_t i + cdef uint32_t sum = 0, imod + for i in range(len - 1, 0, -1): + imod = i & 0x1f + sum ^= BARREL_SHIFT(h[data[0]], imod) + data += 1 + return sum ^ h[data[0]] + + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +@cython.cdivision(True) # Use C division/modulo semantics for integer division. +cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h): + """Update the buzhash with a new byte.""" + cdef uint32_t lenmod = len & 0x1f + return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add] + + +cdef class Chunker: + """ + Content-Defined Chunker, variable chunk sizes. + + This chunker makes quite some effort to cut mostly chunks of the same-content, even if + the content moves to a different offset inside the file. It uses the buzhash + rolling-hash algorithm to identify the chunk cutting places by looking at the + content inside the moving window and computing the rolling hash value over the + window contents. If the last n bits of the rolling hash are 0, a chunk is cut. + Additionally it obeys some more criteria, like a minimum and maximum chunk size. + It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks. 
+ """ + cdef uint32_t chunk_mask + cdef uint32_t* table + cdef uint8_t* data + cdef object _fd # Python object for file descriptor + cdef int fh + cdef int done, eof + cdef size_t min_size, buf_size, window_size, remaining, position, last + cdef long long bytes_read, bytes_yielded # off_t in C, using long long for compatibility + cdef readonly float chunking_time + cdef object file_reader # FileReader instance + cdef size_t reader_block_size + cdef bint sparse + + def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + min_size = 1 << chunk_min_exp + max_size = 1 << chunk_max_exp + assert max_size <= len(zeros) + # see chunker_process, first while loop condition, first term must be able to get True: + assert hash_window_size + min_size + 1 <= max_size, "too small max_size" + + self.window_size = hash_window_size + self.chunk_mask = (1 << hash_mask_bits) - 1 + self.min_size = min_size + self.table = buzhash_init_table(seed & 0xffffffff) + self.buf_size = max_size + self.data = malloc(self.buf_size) + self.fh = -1 + self.done = 0 + self.eof = 0 + self.remaining = 0 + self.position = 0 + self.last = 0 + self.bytes_read = 0 + self.bytes_yielded = 0 + self._fd = None + self.chunking_time = 0.0 + self.reader_block_size = 1024 * 1024 + self.sparse = sparse + + def __dealloc__(self): + """Free the chunker's resources.""" + if self.table != NULL: + free(self.table) + self.table = NULL + if self.data != NULL: + free(self.data) + self.data = NULL + + cdef int fill(self) except 0: + """Fill the chunker's buffer with more data.""" + cdef ssize_t n + cdef object chunk + + # Move remaining data to the beginning of the buffer + memmove(self.data, self.data + self.last, self.position + self.remaining - self.last) + self.position -= self.last + self.last = 0 + n = self.buf_size - self.position - self.remaining + + if self.eof or n == 0: + return 1 + + # Use FileReader to read data + chunk = 
self.file_reader.read(n) + n = chunk.meta["size"] + + if n > 0: + # Only copy data if it's not a hole + if chunk.meta["allocation"] == CH_DATA: + # Copy data from chunk to our buffer + memcpy(self.data + self.position + self.remaining, PyBytes_AsString(chunk.data), n) + else: + # For holes, fill with zeros + memcpy(self.data + self.position + self.remaining, PyBytes_AsString(zeros[:n]), n) + + self.remaining += n + self.bytes_read += n + else: + self.eof = 1 + + return 1 + + cdef object process(self) except *: + """Process the chunker's buffer and return the next chunk.""" + cdef uint32_t sum, chunk_mask = self.chunk_mask + cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size + cdef uint8_t* p + cdef uint8_t* stop_at + cdef size_t did_bytes + + if self.done: + if self.bytes_read == self.bytes_yielded: + raise StopIteration + else: + raise Exception("chunkifier byte count mismatch") + + while self.remaining < min_size + window_size + 1 and not self.eof: # see assert in Chunker init + if not self.fill(): + return None + + # Here we either are at eof... + if self.eof: + self.done = 1 + if self.remaining: + self.bytes_yielded += self.remaining + # Return a memory view of the remaining data + return memoryview((self.data + self.position)[:self.remaining]) + else: + if self.bytes_read == self.bytes_yielded: + raise StopIteration + else: + raise Exception("chunkifier byte count mismatch") + + # ... or we have at least min_size + window_size + 1 bytes remaining. + # We do not want to "cut" a chunk smaller than min_size and the hash + # window starts at the potential cutting place. 
+ self.position += min_size + self.remaining -= min_size + sum = _buzhash(self.data + self.position, window_size, self.table) + + while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size): + p = self.data + self.position + stop_at = p + self.remaining - window_size + + while p < stop_at and (sum & chunk_mask): + sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table) + p += 1 + + did_bytes = p - (self.data + self.position) + self.position += did_bytes + self.remaining -= did_bytes + + if self.remaining <= window_size: + if not self.fill(): + return None + + if self.remaining <= window_size: + self.position += self.remaining + self.remaining = 0 + + old_last = self.last + self.last = self.position + n = self.last - old_last + self.bytes_yielded += n + + # Return a memory view of the chunk + return memoryview((self.data + old_last)[:n]) + + def chunkify(self, fd, fh=-1, fmap=None): + """ + Cut a file into chunks. + + :param fd: Python file object + :param fh: OS-level file handle (if available), + defaults to -1 which means not to use OS-level fd. + :param fmap: a file map, same format as generated by sparsemap + """ + self._fd = fd + self.fh = fh + self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap) + self.done = 0 + self.remaining = 0 + self.bytes_read = 0 + self.bytes_yielded = 0 + self.position = 0 + self.last = 0 + self.eof = 0 + return self + + def __iter__(self): + return self + + def __next__(self): + started_chunking = time.monotonic() + data = self.process() + got = len(data) + # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code, + # but we can just check if data was all-zero (and either came from a hole + # or from stored zeros - we can not detect that here). 
+ if zeros.startswith(data): + data = None + allocation = CH_ALLOC + else: + allocation = CH_DATA + self.chunking_time += time.monotonic() - started_chunking + return Chunk(data, size=got, allocation=allocation) + + +def buzhash(data, unsigned long seed): + cdef uint32_t *table + cdef uint32_t sum + table = buzhash_init_table(seed & 0xffffffff) + sum = _buzhash( data, len(data), table) + free(table) + return sum + + +def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): + cdef uint32_t *table + table = buzhash_init_table(seed & 0xffffffff) + sum = _buzhash_update(sum, remove, add, len, table) + free(table) + return sum diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py new file mode 100644 index 000000000..1c6337047 --- /dev/null +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -0,0 +1,74 @@ +# Note: these tests are part of the self test, do not use or import pytest functionality here. +# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT + +from io import BytesIO + +from ...chunkers import get_chunker +from ...chunkers.buzhash import buzhash, buzhash_update, Chunker +from ...constants import * # NOQA +from .. import BaseTestCase +from . 
import cf + + +class ChunkerTestCase(BaseTestCase): + def test_chunkify(self): + data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y" + parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) + self.assert_equal(len(parts), 2) + self.assert_equal(b"".join(parts), data) + self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) + self.assert_equal( + cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"], + ) + self.assert_equal( + cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fo", b"obarb", b"oob", b"azf", b"oobarb", b"oob", b"azf", b"oobarb", b"oobaz"], + ) + self.assert_equal( + cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foob", b"ar", b"boobazfoob", b"ar", b"boobazfoob", b"ar", b"boobaz"], + ) + self.assert_equal( + cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarboobaz" * 3] + ) + self.assert_equal( + cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobar", b"boobazfo", b"obar", b"boobazfo", b"obar", b"boobaz"], + ) + self.assert_equal( + cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foob", b"arboobaz", b"foob", b"arboobaz", b"foob", b"arboobaz"], + ) + self.assert_equal( + cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarboobaz" * 3] + ) + self.assert_equal( + cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobar", b"boobazfo", b"obarboobaz"], + ) + self.assert_equal( + cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + ) + + def test_buzhash(self): + self.assert_equal(buzhash(b"abcdefghijklmnop", 0), 3795437769) + self.assert_equal(buzhash(b"abcdefghijklmnop", 
1), 3795400502) + self.assert_equal( + buzhash(b"abcdefghijklmnop", 1), buzhash_update(buzhash(b"Xabcdefghijklmno", 1), ord("X"), ord("p"), 16, 1) + ) + # Test with more than 31 bytes to make sure our barrel_shift macro works correctly + self.assert_equal(buzhash(b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz", 0), 566521248) + + def test_small_reads(self): + class SmallReadFile: + input = b"a" * (20 + 1) + + def read(self, nbytes): + self.input = self.input[:-1] + return self.input[:1] + + chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False) + reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) + assert reconstructed == b"a" * 20 diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py new file mode 100644 index 000000000..3b33a1cc9 --- /dev/null +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -0,0 +1,69 @@ +from hashlib import sha256 +from io import BytesIO +import os + +from . import cf +from ...chunkers import Chunker +from ...constants import * # NOQA +from ...helpers import hex_to_bin + + +def H(data): + return sha256(data).digest() + + +def test_chunkpoints_unchanged(): + def twist(size): + x = 1 + a = bytearray(size) + for i in range(size): + x = (x * 1103515245 + 12345) & 0x7FFFFFFF + a[i] = x & 0xFF + return a + + data = twist(100000) + + runs = [] + for winsize in (65, 129, HASH_WINDOW_SIZE, 7351): + for minexp in (4, 6, 7, 11, 12): + for maxexp in (15, 17): + if minexp >= maxexp: + continue + for maskbits in (4, 7, 10, 12): + for seed in (1849058162, 1234567653): + fh = BytesIO(data) + chunker = Chunker(seed, minexp, maxexp, maskbits, winsize) + chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] + runs.append(H(b"".join(chunks))) + + # The "correct" hash below matches the existing chunker behavior. + # Future chunker optimisations must not change this, or existing repos will bloat. 
+ overall_hash = H(b"".join(runs)) + assert overall_hash == hex_to_bin("a43d0ecb3ae24f38852fcc433a83dacd28fe0748d09cc73fc11b69cf3f1a7299") + + +def test_buzhash_chunksize_distribution(): + data = os.urandom(1048576) + min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB + chunker = Chunker(0, min_exp, max_exp, mask, 4095) + f = BytesIO(data) + chunks = cf(chunker.chunkify(f)) + del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp + chunk_sizes = [len(chunk) for chunk in chunks] + chunks_count = len(chunks) + min_chunksize_observed = min(chunk_sizes) + max_chunksize_observed = max(chunk_sizes) + min_count = sum(int(size == 2**min_exp) for size in chunk_sizes) + max_count = sum(int(size == 2**max_exp) for size in chunk_sizes) + print( + f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} " + f"min count: {min_count} max count: {max_count}" + ) + # usually there will about 64 chunks + assert 32 < chunks_count < 128 + # chunks always must be between min and max (clipping must work): + assert min_chunksize_observed >= 2**min_exp + assert max_chunksize_observed <= 2**max_exp + # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size: + assert min_count < 10 + assert max_count < 10 From 6a6622f9d8a41b7595eb2a023696965ae8136b7f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 Jun 2025 19:22:46 +0200 Subject: [PATCH 2/8] buzhash64: adapt buzhash and tests for 64bit --- src/borg/chunkers/buzhash64.pyi | 6 +- src/borg/chunkers/buzhash64.pyx | 117 ++++++------------ src/borg/constants.py | 2 + .../testsuite/chunkers/buzhash64_self_test.py | 65 +++++----- src/borg/testsuite/chunkers/buzhash64_test.py | 12 +- 5 files changed, 83 insertions(+), 119 deletions(-) diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi index 16f9d46bb..4ca5359d3 100644 --- a/src/borg/chunkers/buzhash64.pyi +++ b/src/borg/chunkers/buzhash64.pyi 
@@ -4,10 +4,10 @@ from .reader import fmap_entry API_VERSION: str -def buzhash(data: bytes, seed: int) -> int: ... -def buzhash_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ... +def buzhash64(data: bytes, seed: int) -> int: ... +def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ... -class Chunker: +class ChunkerBuzHash64: def __init__( self, seed: int, diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index 974a02707..db264d74a 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -4,8 +4,10 @@ API_VERSION = '1.2_01' import cython import time +from hashlib import sha256 + from cpython.bytes cimport PyBytes_AsString -from libc.stdint cimport uint8_t, uint32_t +from libc.stdint cimport uint8_t, uint64_t from libc.stdlib cimport malloc, free from libc.string cimport memcpy, memmove @@ -22,86 +24,43 @@ from .reader import FileReader, Chunk # # Some properties of buzhash / of this implementation: # -# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window; -# any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within -# the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence +# (1) the hash is designed for inputs <= 64 bytes, but the chunker uses it on a 4095 byte window; +# any repeating bytes at distance 64 within those 4095 bytes can cause cancellation within +# the hash function, e.g. in "X <any 63 bytes> X", the last X would cancel out the influence # of the first X on the hash value. -# -# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of -# 0/1 bit values per position, but the hard coded table below doesn't fit that property. -# -# (3) if you would use a window size divisible by 64, the seed would cancel itself out completely. -# this is why we use a window size of 4095 bytes. 
-# -# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant -# is equivalent to XORing the hash output with a different constant. but since the seed is stored -# encrypted, i think it still serves its purpose. - -cdef uint32_t table_base[256] -table_base = [ - 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, - 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, - 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, - 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, - 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, - 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, - 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, - 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, - 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, - 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, - 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, - 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, - 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, - 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, - 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, - 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, - 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, - 0x17ac9a20, 0x78d10bfa, 
0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, - 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, - 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, - 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, - 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, - 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, - 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, - 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, - 0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4, - 0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512, - 0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009, - 0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2, - 0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c, - 0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9, - 0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b -] # This seems to be the most reliable way to inline this code, using a C preprocessor macro: cdef extern from *: """ - #define BARREL_SHIFT(v, shift) (((v) << (shift)) | ((v) >> (((32 - (shift)) & 0x1f)))) + #define BARREL_SHIFT64(v, shift) (((v) << (shift)) | ((v) >> (((64 - (shift)) & 0x3f)))) """ - uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift) + uint64_t BARREL_SHIFT64(uint64_t v, uint64_t shift) @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. 
-cdef uint32_t* buzhash_init_table(uint32_t seed): +cdef uint64_t* buzhash64_init_table(uint64_t seed): """Initialize the buzhash table with the given seed.""" cdef int i - cdef uint32_t* table = malloc(1024) # 256 * sizeof(uint32_t) + cdef uint64_t* table = malloc(2048) # 256 * sizeof(uint64_t) for i in range(256): - table[i] = table_base[i] ^ seed + # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed: + v = f"{i:02x}{seed:016x}".encode() + d64 = sha256(v).digest()[:8] + table[i] = int.from_bytes(d64, byteorder='little') return table @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. @cython.cdivision(True) # Use C division/modulo semantics for integer division. -cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h): +cdef uint64_t _buzhash64(const unsigned char* data, size_t len, const uint64_t* h): """Calculate the buzhash of the given data.""" - cdef uint32_t i - cdef uint32_t sum = 0, imod + cdef uint64_t i + cdef uint64_t sum = 0, imod for i in range(len - 1, 0, -1): - imod = i & 0x1f - sum ^= BARREL_SHIFT(h[data[0]], imod) + imod = i & 0x3f + sum ^= BARREL_SHIFT64(h[data[0]], imod) data += 1 return sum ^ h[data[0]] @@ -109,13 +68,13 @@ cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h) @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. @cython.cdivision(True) # Use C division/modulo semantics for integer division. 
-cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h): +cdef uint64_t _buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, const uint64_t* h): """Update the buzhash with a new byte.""" - cdef uint32_t lenmod = len & 0x1f - return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add] + cdef uint64_t lenmod = len & 0x3f + return BARREL_SHIFT64(sum, 1) ^ BARREL_SHIFT64(h[remove], lenmod) ^ h[add] -cdef class Chunker: +cdef class ChunkerBuzHash64: """ Content-Defined Chunker, variable chunk sizes. @@ -127,8 +86,8 @@ cdef class Chunker: Additionally it obeys some more criteria, like a minimum and maximum chunk size. It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks. """ - cdef uint32_t chunk_mask - cdef uint32_t* table + cdef uint64_t chunk_mask + cdef uint64_t* table cdef uint8_t* data cdef object _fd # Python object for file descriptor cdef int fh @@ -150,7 +109,7 @@ cdef class Chunker: self.window_size = hash_window_size self.chunk_mask = (1 << hash_mask_bits) - 1 self.min_size = min_size - self.table = buzhash_init_table(seed & 0xffffffff) + self.table = buzhash64_init_table(seed & 0xffffffffffffffff) self.buf_size = max_size self.data = malloc(self.buf_size) self.fh = -1 @@ -211,7 +170,7 @@ cdef class Chunker: cdef object process(self) except *: """Process the chunker's buffer and return the next chunk.""" - cdef uint32_t sum, chunk_mask = self.chunk_mask + cdef uint64_t sum, chunk_mask = self.chunk_mask cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size cdef uint8_t* p cdef uint8_t* stop_at @@ -245,14 +204,14 @@ cdef class Chunker: # window starts at the potential cutting place. 
self.position += min_size self.remaining -= min_size - sum = _buzhash(self.data + self.position, window_size, self.table) + sum = _buzhash64(self.data + self.position, window_size, self.table) while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size): p = self.data + self.position stop_at = p + self.remaining - window_size while p < stop_at and (sum & chunk_mask): - sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table) + sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table) p += 1 did_bytes = p - (self.data + self.position) @@ -315,18 +274,18 @@ cdef class Chunker: return Chunk(data, size=got, allocation=allocation) -def buzhash(data, unsigned long seed): - cdef uint32_t *table - cdef uint32_t sum - table = buzhash_init_table(seed & 0xffffffff) - sum = _buzhash( data, len(data), table) +def buzhash64(data, unsigned long seed): + cdef uint64_t *table + cdef uint64_t sum + table = buzhash64_init_table(seed & 0xffffffffffffffff) + sum = _buzhash64( data, len(data), table) free(table) return sum -def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): - cdef uint32_t *table - table = buzhash_init_table(seed & 0xffffffff) - sum = _buzhash_update(sum, remove, add, len, table) +def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): + cdef uint64_t *table + table = buzhash64_init_table(seed & 0xffffffffffffffff) + sum = _buzhash64_update(sum, remove, add, len, table) free(table) return sum diff --git a/src/borg/constants.py b/src/borg/constants.py index 911a8f1be..8bad4b6b4 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -92,6 +92,7 @@ MAX_SEGMENT_DIR_INDEX = 2**32 - 1 # chunker algorithms CH_BUZHASH = "buzhash" +CH_BUZHASH64 = "buzhash64" CH_FIXED = "fixed" CH_FAIL = "fail" @@ -103,6 +104,7 @@ HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically 
# defaults, use --chunker-params to override CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) +CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) # chunker params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index 1c6337047..60189b4b2 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -4,64 +4,67 @@ from io import BytesIO from ...chunkers import get_chunker -from ...chunkers.buzhash import buzhash, buzhash_update, Chunker +from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64 from ...constants import * # NOQA from .. import BaseTestCase from . import cf -class ChunkerTestCase(BaseTestCase): - def test_chunkify(self): +class ChunkerBuzHash64TestCase(BaseTestCase): + def test_chunkify64(self): data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y" - parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) + parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) self.assert_equal(len(parts), 2) self.assert_equal(b"".join(parts), data) - self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) + self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) self.assert_equal( - cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"], + cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"], ) self.assert_equal( - cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 
3))), - [b"fo", b"obarb", b"oob", b"azf", b"oobarb", b"oob", b"azf", b"oobarb", b"oobaz"], + cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobazfoobarboobazfoobarboobaz"], ) self.assert_equal( - cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foob", b"ar", b"boobazfoob", b"ar", b"boobazfoob", b"ar", b"boobaz"], + cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"], ) self.assert_equal( - cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarboobaz" * 3] + cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"], ) self.assert_equal( - cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobar", b"boobazfo", b"obar", b"boobazfo", b"obar", b"boobaz"], + cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], ) self.assert_equal( - cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foob", b"arboobaz", b"foob", b"arboobaz", b"foob", b"arboobaz"], + cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"], ) self.assert_equal( - cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarboobaz" * 3] + cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"], ) self.assert_equal( - cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbo", b"obazfoobar", b"boobazfo", b"obarboobaz"], + cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + 
[b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], ) self.assert_equal( - cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"], ) - def test_buzhash(self): - self.assert_equal(buzhash(b"abcdefghijklmnop", 0), 3795437769) - self.assert_equal(buzhash(b"abcdefghijklmnop", 1), 3795400502) - self.assert_equal( - buzhash(b"abcdefghijklmnop", 1), buzhash_update(buzhash(b"Xabcdefghijklmno", 1), ord("X"), ord("p"), 16, 1) - ) - # Test with more than 31 bytes to make sure our barrel_shift macro works correctly - self.assert_equal(buzhash(b"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz", 0), 566521248) + def test_buzhash64(self): + self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849) + self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719) + expected = buzhash64(b"abcdefghijklmnop", 1) + previous = buzhash64(b"Xabcdefghijklmno", 1) + this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1) + self.assert_equal(this, expected) + # Test with more than 63 bytes to make sure our barrel_shift macro works correctly + self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313) - def test_small_reads(self): + def test_small_reads64(self): class SmallReadFile: input = b"a" * (20 + 1) @@ -69,6 +72,6 @@ class ChunkerTestCase(BaseTestCase): self.input = self.input[:-1] return self.input[:1] - chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False) + chunker = get_chunker(*CHUNKER64_PARAMS, seed=0, sparse=False) reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b"a" * 20 diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index 3b33a1cc9..fef302838 100644 --- 
a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -3,7 +3,7 @@ from io import BytesIO import os from . import cf -from ...chunkers import Chunker +from ...chunkers import ChunkerBuzHash64 from ...constants import * # NOQA from ...helpers import hex_to_bin @@ -12,7 +12,7 @@ def H(data): return sha256(data).digest() -def test_chunkpoints_unchanged(): +def test_chunkpoints64_unchanged(): def twist(size): x = 1 a = bytearray(size) @@ -32,20 +32,20 @@ def test_chunkpoints_unchanged(): for maskbits in (4, 7, 10, 12): for seed in (1849058162, 1234567653): fh = BytesIO(data) - chunker = Chunker(seed, minexp, maxexp, maskbits, winsize) + chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize) chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] runs.append(H(b"".join(chunks))) # The "correct" hash below matches the existing chunker behavior. # Future chunker optimisations must not change this, or existing repos will bloat. overall_hash = H(b"".join(runs)) - assert overall_hash == hex_to_bin("a43d0ecb3ae24f38852fcc433a83dacd28fe0748d09cc73fc11b69cf3f1a7299") + assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7") -def test_buzhash_chunksize_distribution(): +def test_buzhash64_chunksize_distribution(): data = os.urandom(1048576) min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB - chunker = Chunker(0, min_exp, max_exp, mask, 4095) + chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095) f = BytesIO(data) chunks = cf(chunker.chunkify(f)) del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp From 63ff136dfe31c585d186ca4f1cb6c59b275ce101 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 Jun 2025 19:23:10 +0200 Subject: [PATCH 3/8] buzhash64: integrate into build --- .gitignore | 1 + scripts/make.py | 1 + setup.py | 3 +++ src/borg/chunkers/__init__.py | 5 +++++ 4 files changed, 10 insertions(+) 
diff --git a/.gitignore b/.gitignore index 028febb19..13717d20a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ src/borg/compress.c src/borg/crypto/low_level.c src/borg/item.c src/borg/chunkers/buzhash.c +src/borg/chunkers/buzhash64.c src/borg/chunkers/reader.c src/borg/checksums.c src/borg/platform/darwin.c diff --git a/scripts/make.py b/scripts/make.py index 05b4072de..0a64493ca 100644 --- a/scripts/make.py +++ b/scripts/make.py @@ -543,6 +543,7 @@ cython_sources = """ src/borg/compress.pyx src/borg/crypto/low_level.pyx src/borg/chunkers/buzhash.pyx +src/borg/chunkers/buzhash64.pyx src/borg/chunkers/reader.pyx src/borg/hashindex.pyx src/borg/item.pyx diff --git a/setup.py b/setup.py index 19f403583..859d34690 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro compress_source = "src/borg/compress.pyx" crypto_ll_source = "src/borg/crypto/low_level.pyx" buzhash_source = "src/borg/chunkers/buzhash.pyx" +buzhash64_source = "src/borg/chunkers/buzhash64.pyx" reader_source = "src/borg/chunkers/reader.pyx" hashindex_source = "src/borg/hashindex.pyx" item_source = "src/borg/item.pyx" @@ -66,6 +67,7 @@ cython_sources = [ compress_source, crypto_ll_source, buzhash_source, + buzhash64_source, reader_source, hashindex_source, item_source, @@ -185,6 +187,7 @@ if not on_rtd: Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags), Extension("borg.item", [item_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), + Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]), Extension("borg.checksums", **checksums_ext_kwargs), ] diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 7f7833b8c..5f3ded4fc 
100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -1,4 +1,5 @@ from .buzhash import Chunker +from .buzhash64 import ChunkerBuzHash64 from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa @@ -11,6 +12,10 @@ def get_chunker(algo, *params, **kw): seed = kw["seed"] sparse = kw["sparse"] return Chunker(seed, *params, sparse=sparse) + if algo == "buzhash64": + seed = kw["seed"] + sparse = kw["sparse"] + return ChunkerBuzHash64(seed, *params, sparse=sparse) if algo == "fixed": sparse = kw["sparse"] return ChunkerFixed(*params, sparse=sparse) From dc2dab15358dbf0a5e7fab879478c8fe71856fdd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 Jun 2025 19:32:27 +0200 Subject: [PATCH 4/8] buzhash64: integrate into borg benchmark command --- src/borg/archiver/benchmark_cmd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index e69aa2c18..175b940d6 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -147,6 +147,7 @@ class BenchmarkMixIn: for spec, func in [ ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)), + ("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, seed=0, sparse=False)), ("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)), ]: print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s") From 6f55cba0ceba40636acc44d4b2845f4448684b79 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 Jun 2025 22:25:22 +0200 Subject: [PATCH 5/8] ChunkerParams: add support for buzhash64 --- src/borg/helpers/parseformat.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index cb481685a..c98d16f79 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -187,6 +187,21 @@ def ChunkerParams(s): 
return algo, block_size, header_size if algo == "default" and count == 1: # default return CHUNKER_PARAMS + if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size + chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:]) + if not (chunk_min <= chunk_mask <= chunk_max): + raise argparse.ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max") + if chunk_min < 6: + # see comment in 'fixed' algo check + raise argparse.ArgumentTypeError( + "min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)" + ) + if chunk_max > 23: + raise argparse.ArgumentTypeError( + "max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)" + ) + # note that for buzhash64, there is no problem with even window_size. + return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :]) From 544b3f41a90eea4200cfea13ab83b0ab7adb2815 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 Jun 2025 01:52:29 +0200 Subject: [PATCH 6/8] get_chunker: give it the key instead of the seed the buzhash seed only has 32bits, but we rather want 64bits for buzhash64. just take them from crypt_key for now. 
--- src/borg/archive.py | 8 ++++---- src/borg/archiver/benchmark_cmd.py | 4 ++-- src/borg/archiver/transfer_cmd.py | 2 +- src/borg/chunkers/__init__.py | 17 +++++++++++------ .../testsuite/chunkers/buzhash64_self_test.py | 2 +- .../testsuite/chunkers/buzhash_self_test.py | 2 +- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 462da3136..5bf8faaec 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -351,7 +351,7 @@ class ChunkBuffer: self.packer = msgpack.Packer() self.chunks = [] self.key = key - self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, sparse=False) + self.chunker = get_chunker(*chunker_params, key=self.key, sparse=False) self.saved_chunks_len = None def add(self, item): @@ -1227,7 +1227,7 @@ class FilesystemObjectProcessors: self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) self.cwd = os.getcwd() - self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse) + self.chunker = get_chunker(*chunker_params, key=key, sparse=sparse) @contextmanager def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None): @@ -1502,7 +1502,7 @@ class TarfileObjectProcessors: self.print_file_status = file_status_printer or (lambda *args: None) self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress) - self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False) + self.chunker = get_chunker(*chunker_params, key=key, sparse=False) self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks @contextmanager @@ -2325,7 +2325,7 @@ class ArchiveRecreater: target.process_file_chunks = ChunksProcessor( cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify ).process_file_chunks - 
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False) + target.chunker = get_chunker(*target.chunker_params, key=self.key, sparse=False) return target def create_target_archive(self, name): diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index 175b940d6..2818435f1 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -146,8 +146,8 @@ class BenchmarkMixIn: pass for spec, func in [ - ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)), - ("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, seed=0, sparse=False)), + ("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)), + ("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)), ("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)), ]: print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s") diff --git a/src/borg/archiver/transfer_cmd.py b/src/borg/archiver/transfer_cmd.py index 617c8abec..4ada0b848 100644 --- a/src/borg/archiver/transfer_cmd.py +++ b/src/borg/archiver/transfer_cmd.py @@ -41,7 +41,7 @@ def transfer_chunks( file = ChunkIteratorFileWrapper(chunk_iterator) # Create a chunker with the specified parameters - chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False) + chunker = get_chunker(*chunker_params, key=archive.key, sparse=False) for chunk in chunker.chunkify(file): if not dry_run: chunk_id, data = cached_hash(chunk, archive.key.id_hash) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 5f3ded4fc..c3c625760 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -3,21 +3,26 @@ from .buzhash64 import ChunkerBuzHash64 from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa +from ..crypto.key import PlaintextKey API_VERSION = 
"1.2_01" def get_chunker(algo, *params, **kw): + key = kw.get("key", None) + sparse = kw.get("sparse", False) + # key.chunk_seed only has 32bits + seed = key.chunk_seed if key is not None else 0 + # we want 64bits for buzhash64, get them from crypt_key + if key is None or isinstance(key, PlaintextKey): + seed64 = 0 + else: + seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little") if algo == "buzhash": - seed = kw["seed"] - sparse = kw["sparse"] return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - seed = kw["seed"] - sparse = kw["sparse"] - return ChunkerBuzHash64(seed, *params, sparse=sparse) + return ChunkerBuzHash64(seed64, *params, sparse=sparse) if algo == "fixed": - sparse = kw["sparse"] return ChunkerFixed(*params, sparse=sparse) if algo == "fail": return ChunkerFailing(*params) diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index 60189b4b2..a356afbbf 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -72,6 +72,6 @@ class ChunkerBuzHash64TestCase(BaseTestCase): self.input = self.input[:-1] return self.input[:1] - chunker = get_chunker(*CHUNKER64_PARAMS, seed=0, sparse=False) + chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False) reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b"a" * 20 diff --git a/src/borg/testsuite/chunkers/buzhash_self_test.py b/src/borg/testsuite/chunkers/buzhash_self_test.py index 1c6337047..9baf862f3 100644 --- a/src/borg/testsuite/chunkers/buzhash_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash_self_test.py @@ -69,6 +69,6 @@ class ChunkerTestCase(BaseTestCase): self.input = self.input[:-1] return self.input[:1] - chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False) + chunker = get_chunker(*CHUNKER_PARAMS, sparse=False) reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b"a" * 20 
From b9646f236ebd1a16cb49a8e8adaf550eea2391ac Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 Jun 2025 22:39:25 +0200 Subject: [PATCH 7/8] buzhash64: init table using a 256bit key derived from ID key That way we can feed lots of entropy into the table creation. The bh64_key is derived from the id_key (NOT the crypt_key), thus it will create the same key for related repositories (even if they use different encryption/authentication keys). Due to that, it will also create the same buzhash64 table, will cut chunks at the same points and deduplication will work amongst the related repositories. --- src/borg/chunkers/__init__.py | 10 ++-- src/borg/chunkers/buzhash64.pyi | 6 +-- src/borg/chunkers/buzhash64.pyx | 20 +++---- .../testsuite/chunkers/buzhash64_self_test.py | 52 +++++++++---------- src/borg/testsuite/chunkers/buzhash64_test.py | 8 +-- 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index c3c625760..463be44c8 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -3,7 +3,6 @@ from .buzhash64 import ChunkerBuzHash64 from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa -from ..crypto.key import PlaintextKey API_VERSION = "1.2_01" @@ -13,15 +12,12 @@ def get_chunker(algo, *params, **kw): sparse = kw.get("sparse", False) # key.chunk_seed only has 32bits seed = key.chunk_seed if key is not None else 0 - # we want 64bits for buzhash64, get them from crypt_key - if key is None or isinstance(key, PlaintextKey): - seed64 = 0 - else: - seed64 = int.from_bytes(key.crypt_key[:8], byteorder="little") + # for buzhash64, we want a much longer key, so we derive it from the id key + bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"" if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - return ChunkerBuzHash64(seed64, 
*params, sparse=sparse) + return ChunkerBuzHash64(bh64_key, *params, sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/buzhash64.pyi b/src/borg/chunkers/buzhash64.pyi index 4ca5359d3..3414bd609 100644 --- a/src/borg/chunkers/buzhash64.pyi +++ b/src/borg/chunkers/buzhash64.pyi @@ -4,13 +4,13 @@ from .reader import fmap_entry API_VERSION: str -def buzhash64(data: bytes, seed: int) -> int: ... -def buzhash64_update(sum: int, remove: int, add: int, len: int, seed: int) -> int: ... +def buzhash64(data: bytes, key: bytes) -> int: ... +def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ... class ChunkerBuzHash64: def __init__( self, - seed: int, + key: bytes, chunk_min_exp: int, chunk_max_exp: int, hash_mask_bits: int, diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index db264d74a..0199406fe 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -39,13 +39,13 @@ cdef extern from *: @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. 
-cdef uint64_t* buzhash64_init_table(uint64_t seed): - """Initialize the buzhash table with the given seed.""" +cdef uint64_t* buzhash64_init_table(bytes key): + """Initialize the buzhash table using the given key.""" cdef int i cdef uint64_t* table = malloc(2048) # 256 * sizeof(uint64_t) for i in range(256): - # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the seed: - v = f"{i:02x}{seed:016x}".encode() + # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key: + v = f"{i:02x}".encode() + key d64 = sha256(v).digest()[:8] table[i] = int.from_bytes(d64, byteorder='little') return table @@ -99,7 +99,7 @@ cdef class ChunkerBuzHash64: cdef size_t reader_block_size cdef bint sparse - def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp assert max_size <= len(zeros) @@ -109,7 +109,7 @@ cdef class ChunkerBuzHash64: self.window_size = hash_window_size self.chunk_mask = (1 << hash_mask_bits) - 1 self.min_size = min_size - self.table = buzhash64_init_table(seed & 0xffffffffffffffff) + self.table = buzhash64_init_table(key) self.buf_size = max_size self.data = malloc(self.buf_size) self.fh = -1 @@ -274,18 +274,18 @@ cdef class ChunkerBuzHash64: return Chunk(data, size=got, allocation=allocation) -def buzhash64(data, unsigned long seed): +def buzhash64(data, bytes key): cdef uint64_t *table cdef uint64_t sum - table = buzhash64_init_table(seed & 0xffffffffffffffff) + table = buzhash64_init_table(key) sum = _buzhash64( data, len(data), table) free(table) return sum -def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): +def buzhash64_update(uint64_t sum, unsigned 
char remove, unsigned char add, size_t len, bytes key): cdef uint64_t *table - table = buzhash64_init_table(seed & 0xffffffffffffffff) + table = buzhash64_init_table(key) sum = _buzhash64_update(sum, remove, add, len, table) free(table) return sum diff --git a/src/borg/testsuite/chunkers/buzhash64_self_test.py b/src/borg/testsuite/chunkers/buzhash64_self_test.py index a356afbbf..41198477d 100644 --- a/src/borg/testsuite/chunkers/buzhash64_self_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_self_test.py @@ -13,56 +13,56 @@ from . import cf class ChunkerBuzHash64TestCase(BaseTestCase): def test_chunkify64(self): data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y" - parts = cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) + parts = cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) self.assert_equal(len(parts), 2) self.assert_equal(b"".join(parts), data) - self.assert_equal(cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) + self.assert_equal(cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b""))), []) self.assert_equal( - cf(ChunkerBuzHash64(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"fo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"ob", b"azfo", b"oba", b"rbo", b"obaz"], + cf(ChunkerBuzHash64(b"0", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobazfoobarboobazfoobarboobaz"], + cf(ChunkerBuzHash64(b"1", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"], + 
cf(ChunkerBuzHash64(b"2", 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobar", b"boobazfoo", b"barboobazfoo", b"barboobaz"], + cf(ChunkerBuzHash64(b"0", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], - ) - self.assert_equal( - cf(ChunkerBuzHash64(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + cf(ChunkerBuzHash64(b"1", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarboobazfoo", b"barboobazfoo", b"barboobaz"], + cf(ChunkerBuzHash64(b"2", 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), - [b"foobarbooba", b"zfoobarbooba", b"zfoobarboobaz"], + cf(ChunkerBuzHash64(b"0", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"], ) self.assert_equal( - cf(ChunkerBuzHash64(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + cf(ChunkerBuzHash64(b"1", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"], ) + self.assert_equal( + cf(ChunkerBuzHash64(b"2", 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b"foobarboobaz" * 3))), + [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"], + ) 
def test_buzhash64(self): - self.assert_equal(buzhash64(b"abcdefghijklmnop", 0), 13314711829666336849) - self.assert_equal(buzhash64(b"abcdefghijklmnop", 1), 17807676237451361719) - expected = buzhash64(b"abcdefghijklmnop", 1) - previous = buzhash64(b"Xabcdefghijklmno", 1) - this = buzhash64_update(previous, ord("X"), ord("p"), 16, 1) + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"0"), 13095190927899934478) + self.assert_equal(buzhash64(b"abcdefghijklmnop", b"1"), 10129419249308136910) + expected = buzhash64(b"abcdefghijklmnop", b"1") + previous = buzhash64(b"Xabcdefghijklmno", b"1") + this = buzhash64_update(previous, ord("X"), ord("p"), 16, b"1") self.assert_equal(this, expected) # Test with more than 63 bytes to make sure our barrel_shift macro works correctly - self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, 0), 592868834756664313) + self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899) def test_small_reads64(self): class SmallReadFile: diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index fef302838..7a0019732 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -30,22 +30,22 @@ def test_chunkpoints64_unchanged(): if minexp >= maxexp: continue for maskbits in (4, 7, 10, 12): - for seed in (1849058162, 1234567653): + for key in (b"first_key", b"second_key"): fh = BytesIO(data) - chunker = ChunkerBuzHash64(seed, minexp, maxexp, maskbits, winsize) + chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize) chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))] runs.append(H(b"".join(chunks))) # The "correct" hash below matches the existing chunker behavior. # Future chunker optimisations must not change this, or existing repos will bloat. 
overall_hash = H(b"".join(runs)) - assert overall_hash == hex_to_bin("fa9002758c0358721404f55f3020bb56b987cb3cd9a688ff9641f4023215f4e7") + assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc") def test_buzhash64_chunksize_distribution(): data = os.urandom(1048576) min_exp, max_exp, mask = 10, 16, 14 # chunk size target 16kiB, clip at 1kiB and 64kiB - chunker = ChunkerBuzHash64(0, min_exp, max_exp, mask, 4095) + chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095) f = BytesIO(data) chunks = cf(chunker.chunkify(f)) del chunks[-1] # get rid of the last chunk, it can be smaller than 2**min_exp From d23704e112a2e47621a9c648b73c8d15977790dd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 Jun 2025 11:56:49 +0200 Subject: [PATCH 8/8] buzhash64: docs --- docs/internals.rst | 4 ++-- docs/internals/data-structures.rst | 11 +++++++++++ docs/internals/security.rst | 14 ++++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/internals.rst b/docs/internals.rst index e587803cb..3c6645c19 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -19,8 +19,8 @@ specified when the backup was performed. Deduplication is performed globally across all data in the repository (multiple backups and even multiple hosts), both on data and file metadata, using :ref:`chunks` created by the chunker using the -Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize -algorithm ("fixed" chunker). +Buzhash_ algorithm ("buzhash" and "buzhash64" chunker) or a simpler +fixed blocksize algorithm ("fixed" chunker). 
To perform the repository-wide deduplication, a hash of each chunk is checked against the :ref:`chunks cache <cache>`, which is a diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index ff1136a60..b7ffccc36 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -399,6 +399,7 @@ Borg has these chunkers: supporting a header block of different size. - "buzhash": variable, content-defined blocksize, uses a rolling hash computed by the Buzhash_ algorithm. +- "buzhash64": similar to "buzhash", but an improved 64bit implementation. For some more general usage hints see also ``--chunker-params``. @@ -469,6 +470,16 @@ for the repository, and stored encrypted in the keyfile. This is to prevent chunk size based fingerprinting attacks on your encrypted repo contents (to guess what files you have based on a specific set of chunk sizes). +"buzhash64" chunker ++++++++++++++++++++ + +Similar to "buzhash", but using 64bit wide hash values. + +The buzhash table is cryptographically derived from secret key material. + +These changes should improve resistance against attacks and also solve +some of the issues of the original (32bit / XORed table) implementation. + .. _cache: The cache diff --git a/docs/internals/security.rst b/docs/internals/security.rst index 40b27d797..bcddbb2e8 100644 --- a/docs/internals/security.rst +++ b/docs/internals/security.rst @@ -361,13 +361,19 @@ The chunks stored in the repo are the (compressed, encrypted and authenticated) output of the chunker. The sizes of these stored chunks are influenced by the compression, encryption and authentication. 
-buzhash chunker -~~~~~~~~~~~~~~~ +buzhash and buzhash64 chunker +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The buzhash chunker chunks according to the input data, the chunker's -parameters and the secret chunker seed (which all influence the chunk boundary +The buzhash chunkers chunk according to the input data, the chunker's +parameters and secret key material (which all influence the chunk boundary positions). +Secret key material: + +- "buzhash": chunker seed (32bits), used for XORing the hardcoded buzhash table. +- "buzhash64": bh64_key (256bits), derived from the ID key, used to cryptographically + generate the table. + Small files below some specific threshold (default: 512 KiB) result in only one chunk (identical content / size as the original file), bigger files result in multiple chunks.