mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-11 01:41:57 -04:00
Merge pull request #8903 from ThomasWaldmann/buzhash64
buzhash64 chunker
This commit is contained in:
commit
9a65d5245d
17 changed files with 519 additions and 16 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -7,6 +7,7 @@ src/borg/compress.c
|
|||
src/borg/crypto/low_level.c
|
||||
src/borg/item.c
|
||||
src/borg/chunkers/buzhash.c
|
||||
src/borg/chunkers/buzhash64.c
|
||||
src/borg/chunkers/reader.c
|
||||
src/borg/checksums.c
|
||||
src/borg/platform/darwin.c
|
||||
|
|
|
|||
|
|
@ -19,8 +19,8 @@ specified when the backup was performed.
|
|||
Deduplication is performed globally across all data in the repository
|
||||
(multiple backups and even multiple hosts), both on data and file
|
||||
metadata, using :ref:`chunks` created by the chunker using the
|
||||
Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize
|
||||
algorithm ("fixed" chunker).
|
||||
Buzhash_ algorithm ("buzhash" and "buzhash64" chunker) or a simpler
|
||||
fixed blocksize algorithm ("fixed" chunker).
|
||||
|
||||
To perform the repository-wide deduplication, a hash of each
|
||||
chunk is checked against the :ref:`chunks cache <cache>`, which is a
|
||||
|
|
|
|||
|
|
@ -399,6 +399,7 @@ Borg has these chunkers:
|
|||
supporting a header block of different size.
|
||||
- "buzhash": variable, content-defined blocksize, uses a rolling hash
|
||||
computed by the Buzhash_ algorithm.
|
||||
- "buzhash64": similar to "buzhash", but improved 64bit implementation
|
||||
|
||||
For some more general usage hints see also ``--chunker-params``.
|
||||
|
||||
|
|
@ -469,6 +470,16 @@ for the repository, and stored encrypted in the keyfile. This is to prevent
|
|||
chunk size based fingerprinting attacks on your encrypted repo contents (to
|
||||
guess what files you have based on a specific set of chunk sizes).
|
||||
|
||||
"buzhash64" chunker
|
||||
+++++++++++++++++++
|
||||
|
||||
Similar to "buzhash", but using 64bit wide hash values.
|
||||
|
||||
The buzhash table is cryptographically derived from secret key material.
|
||||
|
||||
These changes should improve resistance against attacks and also solve
|
||||
some of the issues of the original (32bit / XORed table) implementation.
|
||||
|
||||
.. _cache:
|
||||
|
||||
The cache
|
||||
|
|
|
|||
|
|
@ -361,13 +361,19 @@ The chunks stored in the repo are the (compressed, encrypted and authenticated)
|
|||
output of the chunker. The sizes of these stored chunks are influenced by the
|
||||
compression, encryption and authentication.
|
||||
|
||||
buzhash chunker
|
||||
~~~~~~~~~~~~~~~
|
||||
buzhash and buzhash64 chunker
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The buzhash chunker chunks according to the input data, the chunker's
|
||||
parameters and the secret chunker seed (which all influence the chunk boundary
|
||||
The buzhash chunkers chunk according to the input data, the chunker's
|
||||
parameters and secret key material (which all influence the chunk boundary
|
||||
positions).
|
||||
|
||||
Secret key material:
|
||||
|
||||
- "buzhash": chunker seed (32bits), used for XORing the hardcoded buzhash table
|
||||
- "buzhash64": bh64_key (256bits) is derived from ID key, used to cryptographically
|
||||
generate the table.
|
||||
|
||||
Small files below some specific threshold (default: 512 KiB) result in only one
|
||||
chunk (identical content / size as the original file), bigger files result in
|
||||
multiple chunks.
|
||||
|
|
|
|||
|
|
@ -543,6 +543,7 @@ cython_sources = """
|
|||
src/borg/compress.pyx
|
||||
src/borg/crypto/low_level.pyx
|
||||
src/borg/chunkers/buzhash.pyx
|
||||
src/borg/chunkers/buzhash64.pyx
|
||||
src/borg/chunkers/reader.pyx
|
||||
src/borg/hashindex.pyx
|
||||
src/borg/item.pyx
|
||||
|
|
|
|||
3
setup.py
3
setup.py
|
|
@ -51,6 +51,7 @@ cflags = ["-Wall", "-Wextra", "-Wpointer-arith", "-Wno-unreachable-code-fallthro
|
|||
compress_source = "src/borg/compress.pyx"
|
||||
crypto_ll_source = "src/borg/crypto/low_level.pyx"
|
||||
buzhash_source = "src/borg/chunkers/buzhash.pyx"
|
||||
buzhash64_source = "src/borg/chunkers/buzhash64.pyx"
|
||||
reader_source = "src/borg/chunkers/reader.pyx"
|
||||
hashindex_source = "src/borg/hashindex.pyx"
|
||||
item_source = "src/borg/item.pyx"
|
||||
|
|
@ -66,6 +67,7 @@ cython_sources = [
|
|||
compress_source,
|
||||
crypto_ll_source,
|
||||
buzhash_source,
|
||||
buzhash64_source,
|
||||
reader_source,
|
||||
hashindex_source,
|
||||
item_source,
|
||||
|
|
@ -185,6 +187,7 @@ if not on_rtd:
|
|||
Extension("borg.hashindex", [hashindex_source], extra_compile_args=cflags),
|
||||
Extension("borg.item", [item_source], extra_compile_args=cflags),
|
||||
Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
|
||||
Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
|
||||
Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags, undef_macros=["NDEBUG"]),
|
||||
Extension("borg.checksums", **checksums_ext_kwargs),
|
||||
]
|
||||
|
|
|
|||
|
|
@ -351,7 +351,7 @@ class ChunkBuffer:
|
|||
self.packer = msgpack.Packer()
|
||||
self.chunks = []
|
||||
self.key = key
|
||||
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, sparse=False)
|
||||
self.chunker = get_chunker(*chunker_params, key=self.key, sparse=False)
|
||||
self.saved_chunks_len = None
|
||||
|
||||
def add(self, item):
|
||||
|
|
@ -1227,7 +1227,7 @@ class FilesystemObjectProcessors:
|
|||
self.hlm = HardLinkManager(id_type=tuple, info_type=(list, type(None))) # (dev, ino) -> chunks or None
|
||||
self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress)
|
||||
self.cwd = os.getcwd()
|
||||
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
|
||||
self.chunker = get_chunker(*chunker_params, key=key, sparse=sparse)
|
||||
|
||||
@contextmanager
|
||||
def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None):
|
||||
|
|
@ -1502,7 +1502,7 @@ class TarfileObjectProcessors:
|
|||
self.print_file_status = file_status_printer or (lambda *args: None)
|
||||
|
||||
self.stats = Statistics(output_json=log_json, iec=iec) # threading: done by cache (including progress)
|
||||
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=False)
|
||||
self.chunker = get_chunker(*chunker_params, key=key, sparse=False)
|
||||
self.hlm = HardLinkManager(id_type=str, info_type=list) # path -> chunks
|
||||
|
||||
@contextmanager
|
||||
|
|
@ -2325,7 +2325,7 @@ class ArchiveRecreater:
|
|||
target.process_file_chunks = ChunksProcessor(
|
||||
cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify
|
||||
).process_file_chunks
|
||||
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False)
|
||||
target.chunker = get_chunker(*target.chunker_params, key=self.key, sparse=False)
|
||||
return target
|
||||
|
||||
def create_target_archive(self, name):
|
||||
|
|
|
|||
|
|
@ -146,7 +146,8 @@ class BenchmarkMixIn:
|
|||
pass
|
||||
|
||||
for spec, func in [
|
||||
("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, seed=0, sparse=False)),
|
||||
("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)),
|
||||
("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)),
|
||||
("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)),
|
||||
]:
|
||||
print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s")
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ def transfer_chunks(
|
|||
file = ChunkIteratorFileWrapper(chunk_iterator)
|
||||
|
||||
# Create a chunker with the specified parameters
|
||||
chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False)
|
||||
chunker = get_chunker(*chunker_params, key=archive.key, sparse=False)
|
||||
for chunk in chunker.chunkify(file):
|
||||
if not dry_run:
|
||||
chunk_id, data = cached_hash(chunk, archive.key.id_hash)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from .buzhash import Chunker
|
||||
from .buzhash64 import ChunkerBuzHash64
|
||||
from .failing import ChunkerFailing
|
||||
from .fixed import ChunkerFixed
|
||||
from .reader import * # noqa
|
||||
|
|
@ -7,12 +8,17 @@ API_VERSION = "1.2_01"
|
|||
|
||||
|
||||
def get_chunker(algo, *params, **kw):
|
||||
key = kw.get("key", None)
|
||||
sparse = kw.get("sparse", False)
|
||||
# key.chunk_seed only has 32bits
|
||||
seed = key.chunk_seed if key is not None else 0
|
||||
# for buzhash64, we want a much longer key, so we derive it from the id key
|
||||
bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
|
||||
if algo == "buzhash":
|
||||
seed = kw["seed"]
|
||||
sparse = kw["sparse"]
|
||||
return Chunker(seed, *params, sparse=sparse)
|
||||
if algo == "buzhash64":
|
||||
return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
|
||||
if algo == "fixed":
|
||||
sparse = kw["sparse"]
|
||||
return ChunkerFixed(*params, sparse=sparse)
|
||||
if algo == "fail":
|
||||
return ChunkerFailing(*params)
|
||||
|
|
|
|||
20
src/borg/chunkers/buzhash64.pyi
Normal file
20
src/borg/chunkers/buzhash64.pyi
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from typing import List, Iterator, BinaryIO
|
||||
|
||||
from .reader import fmap_entry
|
||||
|
||||
API_VERSION: str
|
||||
|
||||
def buzhash64(data: bytes, key: bytes) -> int: ...
|
||||
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
|
||||
|
||||
class ChunkerBuzHash64:
    # Type stub for the Cython extension type of the same name in buzhash64.pyx.
    def __init__(
        self,
        key: bytes,
        chunk_min_exp: int,
        chunk_max_exp: int,
        hash_mask_bits: int,
        hash_window_size: int,
        sparse: bool = False,
    ) -> None: ...
    # fd and fmap default to None, so the annotations spell out the None case explicitly
    # (implicit Optional is rejected by current type checkers).
    def chunkify(
        self, fd: BinaryIO | None = None, fh: int = -1, fmap: List[fmap_entry] | None = None
    ) -> Iterator: ...
|
||||
291
src/borg/chunkers/buzhash64.pyx
Normal file
291
src/borg/chunkers/buzhash64.pyx
Normal file
|
|
@ -0,0 +1,291 @@
|
|||
# cython: language_level=3
|
||||
|
||||
API_VERSION = '1.2_01'
|
||||
|
||||
import cython
|
||||
import time
|
||||
from hashlib import sha256
|
||||
|
||||
from cpython.bytes cimport PyBytes_AsString
|
||||
from libc.stdint cimport uint8_t, uint64_t
|
||||
from libc.stdlib cimport malloc, free
|
||||
from libc.string cimport memcpy, memmove
|
||||
|
||||
from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
|
||||
from .reader import FileReader, Chunk
|
||||
|
||||
# Cyclic polynomial / buzhash
|
||||
#
|
||||
# https://en.wikipedia.org/wiki/Rolling_hash
|
||||
#
|
||||
# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
|
||||
#
|
||||
# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
|
||||
#
|
||||
# Some properties of buzhash / of this implementation:
|
||||
#
|
||||
# (1) the hash is designed for inputs <= 64 bytes, but the chunker uses it on a 4095 byte window;
|
||||
# any repeating bytes at distance 64 within those 4095 bytes can cause cancellation within
|
||||
# the hash function, e.g. in "X <any 63 bytes> X", the last X would cancel out the influence
|
||||
# of the first X on the hash value.
|
||||
|
||||
# BARREL_SHIFT64(v, shift) rotates the 64-bit value v left by `shift` bits.
# The right-shift count is masked with 0x3f, so shift == 0 results in a right shift by 0
# instead of by 64 (shifting a 64-bit value by 64 bits is undefined behavior in C).
# This seems to be the most reliable way to inline this code, using a C preprocessor macro:
cdef extern from *:
    """
    #define BARREL_SHIFT64(v, shift) (((v) << (shift)) | ((v) >> (((64 - (shift)) & 0x3f))))
    """
    # Declaration that lets Cython code use the macro like a function.
    uint64_t BARREL_SHIFT64(uint64_t v, uint64_t shift)
|
||||
|
||||
|
||||
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key) except NULL:
    """
    Allocate and fill the 256-entry buzhash64 table for the given key.

    Entry i is the first 8 bytes (little-endian) of sha256(<i as 2 hex digits> + key).

    The caller owns the returned table and must release it with free().
    Raises MemoryError if the table cannot be allocated; any exception raised while
    computing the entries is propagated after the table has been freed.
    """
    cdef int i
    cdef uint64_t* table = <uint64_t*>malloc(256 * sizeof(uint64_t))
    if table == NULL:
        raise MemoryError("could not allocate the buzhash64 table")
    try:
        for i in range(256):
            # deterministically generate a pseudo-random 64-bit unsigned integer for table entry i involving the key:
            v = f"{i:02x}".encode() + key
            d64 = sha256(v).digest()[:8]
            table[i] = <uint64_t> int.from_bytes(d64, byteorder='little')
    except BaseException:
        # do not leak the table if a Python-level operation above failed (e.g. key is None)
        free(table)
        raise
    return table
|
||||
|
||||
|
||||
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
cdef uint64_t _buzhash64(const unsigned char* data, size_t len, const uint64_t* h):
    """
    Calculate the buzhash of the `len` bytes at `data`, using the table `h`.

    Each byte's table entry is rotated left by its distance to the end of the data
    (modulo 64) and XORed into the result; the last byte's entry is XORed in unrotated.
    For len == 0, no byte contributes and 0 is returned.
    """
    cdef uint64_t i
    cdef uint64_t sum = 0, imod
    if len == 0:
        # len is unsigned: without this guard, len - 1 would wrap around and
        # the loop below would read far beyond the end of data.
        return 0
    for i in range(len - 1, 0, -1):
        imod = i & 0x3f
        sum ^= BARREL_SHIFT64(h[data[0]], imod)
        data += 1
    return sum ^ h[data[0]]
|
||||
|
||||
|
||||
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
@cython.cdivision(True)  # Use C division/modulo semantics for integer division.
cdef uint64_t _buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, const uint64_t* h):
    """
    Roll the hash `sum` forward by one byte.

    `remove` is the byte leaving the window, `add` the byte entering it,
    `len` the window length and `h` the buzhash table.
    """
    # rotate the old hash by one position ...
    cdef uint64_t rotated = BARREL_SHIFT64(sum, 1)
    # ... cancel out the contribution of the byte that leaves the window ...
    cdef uint64_t leaving = BARREL_SHIFT64(h[remove], len & 0x3f)
    # ... and mix in the byte that enters the window.
    return rotated ^ leaving ^ h[add]
|
||||
|
||||
|
||||
cdef class ChunkerBuzHash64:
    """
    Content-Defined Chunker, variable chunk sizes.

    This chunker makes quite some effort to cut mostly chunks of the same-content, even if
    the content moves to a different offset inside the file. It uses the buzhash
    rolling-hash algorithm to identify the chunk cutting places by looking at the
    content inside the moving window and computing the rolling hash value over the
    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
    The buzhash table is generated from the given key, so the cutting places depend on that key.

    Usage: call chunkify(...) and iterate over the result, each iteration yields a Chunk.
    """
    cdef uint64_t chunk_mask  # a chunk is cut when (hash & chunk_mask) == 0
    cdef uint64_t* table  # buzhash table (256 entries), owned by this object
    cdef uint8_t* data  # read buffer of buf_size bytes, owned by this object
    cdef object _fd  # Python object for file descriptor
    cdef int fh
    cdef int done, eof
    cdef size_t min_size, buf_size, window_size, remaining, position, last
    cdef long long bytes_read, bytes_yielded  # off_t in C, using long long for compatibility
    # double (not float): many small time deltas are accumulated here and a
    # single-precision float would lose them once the total has grown.
    cdef readonly double chunking_time
    cdef object file_reader  # FileReader instance
    cdef size_t reader_block_size
    cdef bint sparse

    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
        """
        Set up the chunker.

        :param key: bytes the buzhash table is generated from
        :param chunk_min_exp: minimum chunk size is 2^chunk_min_exp bytes
        :param chunk_max_exp: maximum chunk size (and buffer size) is 2^chunk_max_exp bytes
        :param hash_mask_bits: number of low hash bits that must be 0 to cut a chunk
        :param hash_window_size: size of the rolling hash window in bytes
        :param sparse: passed on to the FileReader created by chunkify
        :raises MemoryError: if the read buffer cannot be allocated
        """
        min_size = 1 << chunk_min_exp
        max_size = 1 << chunk_max_exp
        assert max_size <= len(zeros)
        # see process(), first while loop condition, first term must be able to get True:
        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"

        self.window_size = hash_window_size
        # shift a 64-bit one, so that wide masks do not overflow a narrower C integer type
        self.chunk_mask = ((<uint64_t>1) << hash_mask_bits) - 1
        self.min_size = min_size
        self.table = buzhash64_init_table(key)
        self.buf_size = max_size
        self.data = <uint8_t*>malloc(self.buf_size)
        if self.data == NULL:
            # __dealloc__ still runs and releases the table
            raise MemoryError("could not allocate the chunker buffer")
        self.fh = -1
        self.done = 0
        self.eof = 0
        self.remaining = 0
        self.position = 0
        self.last = 0
        self.bytes_read = 0
        self.bytes_yielded = 0
        self._fd = None
        self.chunking_time = 0.0
        self.reader_block_size = 1024 * 1024
        self.sparse = sparse

    def __dealloc__(self):
        """Free the chunker's resources."""
        if self.table != NULL:
            free(self.table)
            self.table = NULL
        if self.data != NULL:
            free(self.data)
            self.data = NULL

    cdef int fill(self) except 0:
        """
        Fill the chunker's buffer with more data.

        Moves the not yet yielded bytes to the start of the buffer, then reads at most
        as many bytes as fit behind them. Sets eof when the reader delivers nothing.
        Always returns 1 (0 is reserved for signalling an exception).
        """
        cdef ssize_t n
        cdef object chunk

        # Move remaining data to the beginning of the buffer
        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
        self.position -= self.last
        self.last = 0
        n = self.buf_size - self.position - self.remaining

        if self.eof or n == 0:
            return 1

        # Use FileReader to read data
        chunk = self.file_reader.read(n)
        n = chunk.meta["size"]

        if n > 0:
            # Only copy data if it's not a hole
            if chunk.meta["allocation"] == CH_DATA:
                # Copy data from chunk to our buffer
                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
            else:
                # For holes, fill with zeros
                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(zeros[:n]), n)

            self.remaining += n
            self.bytes_read += n
        else:
            self.eof = 1

        return 1

    cdef object process(self) except *:
        """
        Process the chunker's buffer and return the next chunk.

        Raises StopIteration when all data has been yielded and Exception if the
        number of bytes yielded does not match the number of bytes read.
        """
        cdef uint64_t sum, chunk_mask = self.chunk_mask
        cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
        cdef uint8_t* p
        cdef uint8_t* stop_at
        cdef size_t did_bytes

        if self.done:
            if self.bytes_read == self.bytes_yielded:
                raise StopIteration
            else:
                raise Exception("chunkifier byte count mismatch")

        while self.remaining < min_size + window_size + 1 and not self.eof:  # see assert in __cinit__
            if not self.fill():
                return None

        # Here we either are at eof...
        if self.eof:
            self.done = 1
            if self.remaining:
                self.bytes_yielded += self.remaining
                # Return a memory view of the remaining data
                return memoryview((self.data + self.position)[:self.remaining])
            else:
                if self.bytes_read == self.bytes_yielded:
                    raise StopIteration
                else:
                    raise Exception("chunkifier byte count mismatch")

        # ... or we have at least min_size + window_size + 1 bytes remaining.
        # We do not want to "cut" a chunk smaller than min_size and the hash
        # window starts at the potential cutting place.
        self.position += min_size
        self.remaining -= min_size
        sum = _buzhash64(self.data + self.position, window_size, self.table)

        while self.remaining > self.window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
            p = self.data + self.position
            stop_at = p + self.remaining - window_size

            # roll the window forward byte by byte until a cutting place is found
            # or the buffered data is used up
            while p < stop_at and (sum & chunk_mask):
                sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
                p += 1

            did_bytes = p - (self.data + self.position)
            self.position += did_bytes
            self.remaining -= did_bytes

            if self.remaining <= window_size:
                if not self.fill():
                    return None

        # not enough data left for another full window: it all goes into this chunk
        if self.remaining <= window_size:
            self.position += self.remaining
            self.remaining = 0

        old_last = self.last
        self.last = self.position
        n = self.last - old_last
        self.bytes_yielded += n

        # Return a memory view of the chunk
        return memoryview((self.data + old_last)[:n])

    def chunkify(self, fd, fh=-1, fmap=None):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        :return: this chunker, reset and ready to be iterated over
        """
        self._fd = fd
        self.fh = fh
        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
        self.done = 0
        self.remaining = 0
        self.bytes_read = 0
        self.bytes_yielded = 0
        self.position = 0
        self.last = 0
        self.eof = 0
        return self

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next Chunk; the time spent is added to chunking_time."""
        started_chunking = time.monotonic()
        data = self.process()
        got = len(data)
        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
        # but we can just check if data was all-zero (and either came from a hole
        # or from stored zeros - we can not detect that here).
        if zeros.startswith(data):
            data = None
            allocation = CH_ALLOC
        else:
            allocation = CH_DATA
        self.chunking_time += time.monotonic() - started_chunking
        return Chunk(data, size=got, allocation=allocation)
|
||||
|
||||
|
||||
def buzhash64(data, bytes key):
    """
    Return the 64-bit buzhash of `data`, using the buzhash table generated from `key`.

    For empty data, 0 is returned (no byte contributes to the hash).
    """
    cdef uint64_t *table
    cdef uint64_t sum
    cdef size_t size = len(data)
    if size == 0:
        # the low-level hash function must not be called for zero length input,
        # it would read beyond the end of the data.
        return 0
    table = buzhash64_init_table(key)
    try:
        sum = _buzhash64(<const unsigned char *> data, size, table)
    finally:
        # also release the table if converting data to a C pointer failed
        free(table)
    return sum
|
||||
|
||||
|
||||
def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size_t len, bytes key):
    """
    Return the hash `sum` rolled forward by one byte.

    `remove` is the byte value leaving the window, `add` the byte value entering it,
    `len` the window length. The buzhash table is generated from `key` for this call.
    """
    cdef uint64_t *table = buzhash64_init_table(key)
    cdef uint64_t updated = _buzhash64_update(sum, remove, add, len, table)
    free(table)
    return updated
|
||||
|
|
@ -92,6 +92,7 @@ MAX_SEGMENT_DIR_INDEX = 2**32 - 1
|
|||
|
||||
# chunker algorithms
|
||||
CH_BUZHASH = "buzhash"
|
||||
CH_BUZHASH64 = "buzhash64"
|
||||
CH_FIXED = "fixed"
|
||||
CH_FAIL = "fail"
|
||||
|
||||
|
|
@ -103,6 +104,7 @@ HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically
|
|||
|
||||
# defaults, use --chunker-params to override
|
||||
CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
||||
CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
||||
|
||||
# chunker params for the items metadata stream, finer granularity
|
||||
ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
|
||||
|
|
|
|||
|
|
@ -187,6 +187,21 @@ def ChunkerParams(s):
|
|||
return algo, block_size, header_size
|
||||
if algo == "default" and count == 1: # default
|
||||
return CHUNKER_PARAMS
|
||||
if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size
|
||||
chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:])
|
||||
if not (chunk_min <= chunk_mask <= chunk_max):
|
||||
raise argparse.ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
|
||||
if chunk_min < 6:
|
||||
# see comment in 'fixed' algo check
|
||||
raise argparse.ArgumentTypeError(
|
||||
"min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)"
|
||||
)
|
||||
if chunk_max > 23:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)"
|
||||
)
|
||||
# note that for buzhash64, there is no problem with even window_size.
|
||||
return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size
|
||||
# this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
|
||||
if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
|
||||
chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
|
||||
|
|
|
|||
77
src/borg/testsuite/chunkers/buzhash64_self_test.py
Normal file
77
src/borg/testsuite/chunkers/buzhash64_self_test.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
# Note: these tests are part of the self test, do not use or import pytest functionality here.
|
||||
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from ...chunkers import get_chunker
|
||||
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
|
||||
from ...constants import * # NOQA
|
||||
from .. import BaseTestCase
|
||||
from . import cf
|
||||
|
||||
|
||||
class ChunkerBuzHash64TestCase(BaseTestCase):
    """Self-test cases for the buzhash64 chunker and the buzhash64 hash functions."""

    def test_chunkify64(self):
        """Pin the chunks cut for fixed keys and chunker parameters."""

        def chunked(key, min_exp, window_size, payload):
            # all cases use CHUNK_MAX_EXP as max. size exponent and 2 hash mask bits
            chunker = ChunkerBuzHash64(key, min_exp, CHUNK_MAX_EXP, 2, window_size)
            return cf(chunker.chunkify(BytesIO(payload)))

        data = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
        parts = chunked(b"0", 1, 2, data)
        self.assert_equal(len(parts), 2)
        self.assert_equal(b"".join(parts), data)
        self.assert_equal(chunked(b"0", 1, 2, b""), [])

        sample = b"foobarboobaz" * 3
        # (key, chunk_min_exp, hash_window_size, expected chunks)
        cases = [
            (b"0", 1, 2, [b"fo", b"obarbo", b"ob", b"azfo", b"obarbo", b"ob", b"azfo", b"obarbo", b"obaz"]),
            (b"1", 1, 2, [b"fooba", b"rboobaz", b"fooba", b"rboobaz", b"fooba", b"rboobaz"]),
            (b"2", 1, 2, [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"]),
            (b"0", 2, 3, [b"foobarb", b"oobaz", b"foobarb", b"oobaz", b"foobarb", b"oobaz"]),
            (b"1", 2, 3, [b"foobarbo", b"obazfo", b"obarbo", b"obazfo", b"obarbo", b"obaz"]),
            (b"2", 2, 3, [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"]),
            (b"0", 3, 3, [b"foobarbo", b"obazfoobarb", b"oobazfoo", b"barboobaz"]),
            (b"1", 3, 3, [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"]),
            (b"2", 3, 3, [b"foobarboobaz", b"foobarboobaz", b"foobarboobaz"]),
        ]
        for key, min_exp, window_size, expected in cases:
            self.assert_equal(chunked(key, min_exp, window_size, sample), expected)

    def test_buzhash64(self):
        """Pin hash values and check that a rolling update matches a direct computation."""
        window = b"abcdefghijklmnop"
        self.assert_equal(buzhash64(window, b"0"), 13095190927899934478)
        self.assert_equal(buzhash64(window, b"1"), 10129419249308136910)
        # rolling "X" out of and "p" into the previous window must give the hash of the current window
        direct = buzhash64(window, b"1")
        before = buzhash64(b"Xabcdefghijklmno", b"1")
        rolled = buzhash64_update(before, ord("X"), ord("p"), 16, b"1")
        self.assert_equal(rolled, direct)
        # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
        self.assert_equal(buzhash64(b"abcdefghijklmnopqrstuvwxyz" * 4, b"0"), 9064183923498167899)

    def test_small_reads64(self):
        """The chunker must cope with a file object that only returns 1 byte per read."""

        class SmallReadFile:
            pending = b"a" * (20 + 1)

            def read(self, nbytes):
                # drop one byte per call and hand out at most one byte, regardless of nbytes
                self.pending = self.pending[:-1]
                return self.pending[:1]

        source = SmallReadFile()
        chunker = get_chunker(*CHUNKER64_PARAMS, sparse=False)
        pieces = cf(chunker.chunkify(source))
        assert b"".join(pieces) == b"a" * 20
|
||||
69
src/borg/testsuite/chunkers/buzhash64_test.py
Normal file
69
src/borg/testsuite/chunkers/buzhash64_test.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
from hashlib import sha256
|
||||
from io import BytesIO
|
||||
import os
|
||||
|
||||
from . import cf
|
||||
from ...chunkers import ChunkerBuzHash64
|
||||
from ...constants import * # NOQA
|
||||
from ...helpers import hex_to_bin
|
||||
|
||||
|
||||
def H(data):
    """Return the raw (binary) SHA-256 digest of *data*."""
    hasher = sha256()
    hasher.update(data)
    return hasher.digest()
|
||||
|
||||
|
||||
def test_chunkpoints64_unchanged():
    """Chunk the same reproducible data with many parameter sets and pin the overall result."""

    def twist(size):
        # linear congruential generator: reproducible pseudo-random bytes
        state = 1
        out = bytearray(size)
        for pos in range(size):
            state = (state * 1103515245 + 12345) & 0x7FFFFFFF
            out[pos] = state & 0xFF
        return out

    data = twist(100000)

    def run_digest(key, minexp, maxexp, maskbits, winsize):
        # hash of the hashes of all chunks cut with this parameter set
        chunker = ChunkerBuzHash64(key, minexp, maxexp, maskbits, winsize)
        chunk_digests = [H(piece) for piece in cf(chunker.chunkify(BytesIO(data), -1))]
        return H(b"".join(chunk_digests))

    runs = []
    for winsize in (65, 129, HASH_WINDOW_SIZE, 7351):
        for minexp in (4, 6, 7, 11, 12):
            for maxexp in (15, 17):
                if minexp < maxexp:
                    for maskbits in (4, 7, 10, 12):
                        for key in (b"first_key", b"second_key"):
                            runs.append(run_digest(key, minexp, maxexp, maskbits, winsize))

    # The "correct" hash below matches the existing chunker behavior.
    # Future chunker optimisations must not change this, or existing repos will bloat.
    overall_hash = H(b"".join(runs))
    assert overall_hash == hex_to_bin("ab98713d28c5a544eeb8b6a2b5ba6405847bd6924d45fb7e267d173892ad0cdc")
|
||||
|
||||
|
||||
def test_buzhash64_chunksize_distribution():
    """Chunk 1 MiB of random data and check the count and size limits of the chunks."""
    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
    smallest_allowed, largest_allowed = 2**min_exp, 2**max_exp
    chunker = ChunkerBuzHash64(b"", min_exp, max_exp, mask, 4095)
    chunks = cf(chunker.chunkify(BytesIO(os.urandom(1048576))))
    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
    chunk_sizes = [len(chunk) for chunk in chunks]
    chunks_count = len(chunks)
    observed_min = min(chunk_sizes)
    observed_max = max(chunk_sizes)
    # number of chunks that were clipped at the min. / max. chunk size
    clipped_at_min = chunk_sizes.count(smallest_allowed)
    clipped_at_max = chunk_sizes.count(largest_allowed)
    print(
        f"count: {chunks_count} min: {observed_min} max: {observed_max} "
        f"min count: {clipped_at_min} max count: {clipped_at_max}"
    )
    # usually there will be about 64 chunks
    assert 32 < chunks_count < 128
    # chunks always must be between min and max (clipping must work):
    assert observed_min >= smallest_allowed
    assert observed_max <= largest_allowed
    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
    assert clipped_at_min < 10
    assert clipped_at_max < 10
|
||||
|
|
@ -69,6 +69,6 @@ class ChunkerTestCase(BaseTestCase):
|
|||
self.input = self.input[:-1]
|
||||
return self.input[:1]
|
||||
|
||||
chunker = get_chunker(*CHUNKER_PARAMS, seed=0, sparse=False)
|
||||
chunker = get_chunker(*CHUNKER_PARAMS, sparse=False)
|
||||
reconstructed = b"".join(cf(chunker.chunkify(SmallReadFile())))
|
||||
assert reconstructed == b"a" * 20
|
||||
|
|
|
|||
Loading…
Reference in a new issue