From 8c299696aa21f6a64eee64663410d7e06e33529f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 14 Dec 2020 23:46:04 +0100 Subject: [PATCH 01/17] Chunker: yield Chunk namedtuple instead of bytes/memoryview --- src/borg/chunker.pyx | 32 ++++++++++++++++++++++++++++---- src/borg/constants.py | 3 +++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 03122ec4b..0b6f66546 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -4,6 +4,9 @@ API_VERSION = '1.2_01' import errno import os +from collections import namedtuple + +from .constants import CH_DATA, CH_HOLE from libc.stdlib cimport free @@ -26,6 +29,25 @@ cdef extern from "_chunker.c": has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE') +_Chunk = namedtuple('_Chunk', 'meta data') +_Chunk.__doc__ = """\ + Chunk namedtuple + + meta is always a dictionary, data depends on allocation. + + on disk data: + meta = {'allocation' = CH_DATA, 'size' = size_of_data } + data = read_data [bytes or memoryview] + + hole in a sparse file: + meta = {'allocation' = CH_HOLE, 'size' = size_of_hole } + data = None +""" + +def Chunk(data, **meta): + return _Chunk(meta, data) + + def dread(offset, size, fd=None, fh=-1): use_fh = fh >= 0 if use_fh: @@ -178,15 +200,16 @@ class ChunkerFixed: if is_data: # read block from the range data = dread(offset, wanted, fd, fh) + got = len(data) else: # hole # seek over block from the range pos = dseek(wanted, os.SEEK_CUR, fd, fh) - data = self.zeros[:pos - offset] # for now, create zero-bytes here - got = len(data) + data = None + got = pos - offset if got > 0: offset += got range_size -= got - yield data # later, use a better api that tags data vs. hole + yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE) if got < wanted: # we did not get enough data, looks like EOF. 
return @@ -233,7 +256,8 @@ cdef class Chunker: return self def __next__(self): - return chunker_process(self.chunker) + data = chunker_process(self.chunker) + return Chunk(data, size=len(data), allocation=CH_DATA) # no sparse support here def get_chunker(algo, *params, **kw): diff --git a/src/borg/constants.py b/src/borg/constants.py index a20719c65..46c2b564c 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -75,6 +75,9 @@ CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH # chunker params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) +# normal on-disk data, allocated (but not written, all zeros), not allocated hole (all zeros) +CH_DATA, CH_ALLOC, CH_HOLE = 0, 1, 2 + # operating mode of the files cache (for fast skipping of unchanged files) DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode' DEFAULT_FILES_CACHE_MODE = 'cis' # == CacheMode(DEFAULT_FILES_CACHE_MODE_UI) From 7319f85b546bff883bde7155f28b453a9dc87f93 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 15 Dec 2020 00:26:32 +0100 Subject: [PATCH 02/17] adapt the existing chunker tests --- src/borg/testsuite/chunker.py | 54 +++++++++++++++++----------- src/borg/testsuite/chunker_pytest.py | 29 ++++++++------- src/borg/testsuite/chunker_slow.py | 3 +- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py index df79441b6..7a0db7d36 100644 --- a/src/borg/testsuite/chunker.py +++ b/src/borg/testsuite/chunker.py @@ -8,18 +8,32 @@ from . import BaseTestCase # See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT +def cf(chunks): + """chunk filter""" + # this is to simplify testing: either return the data piece (bytes) or the hole length (int). 
+ def _cf(chunk): + if chunk.meta['allocation'] == CH_DATA: + assert len(chunk.data) == chunk.meta['size'] + return bytes(chunk.data) # make sure we have bytes, not memoryview + if chunk.meta['allocation'] == CH_HOLE: + assert chunk.data is None + return chunk.meta['size'] + assert False, "unexpected allocation value" + return [_cf(chunk) for chunk in chunks] + + class ChunkerFixedTestCase(BaseTestCase): def test_chunkify_just_blocks(self): data = b'foobar' * 1500 chunker = ChunkerFixed(4096) - parts = [c for c in chunker.chunkify(BytesIO(data))] + parts = cf(chunker.chunkify(BytesIO(data))) self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]]) def test_chunkify_header_and_blocks(self): data = b'foobar' * 1500 chunker = ChunkerFixed(4096, 123) - parts = [c for c in chunker.chunkify(BytesIO(data))] + parts = cf(chunker.chunkify(BytesIO(data))) self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]]) def test_chunkify_just_blocks_fmap_complete(self): @@ -30,7 +44,7 @@ class ChunkerFixedTestCase(BaseTestCase): (4096, 8192, True), (8192, 99999999, True), ] - parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)] + parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap)) self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]]) def test_chunkify_header_and_blocks_fmap_complete(self): @@ -42,7 +56,7 @@ class ChunkerFixedTestCase(BaseTestCase): (123+4096, 4096, True), (123+8192, 4096, True), ] - parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)] + parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap)) self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]]) def test_chunkify_header_and_blocks_fmap_zeros(self): @@ -54,9 +68,9 @@ class ChunkerFixedTestCase(BaseTestCase): (123+4096, 4096, True), (123+8192, 4096, False), ] - parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)] - # because we marked the '_' ranges as holes, we 
will get '\0' ranges instead! - self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096]) + parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap)) + # because we marked the '_' ranges as holes, we will get hole ranges instead! + self.assert_equal(parts, [data[0:123], 4096, data[123+4096:123+8192], 4096]) def test_chunkify_header_and_blocks_fmap_partial(self): data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096 @@ -67,7 +81,7 @@ class ChunkerFixedTestCase(BaseTestCase): (123+4096, 4096, True), # (123+8192, 4096, False), ] - parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)] + parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap)) # because we left out the '_' ranges from the fmap, we will not get them at all! self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]]) @@ -76,19 +90,19 @@ class ChunkerTestCase(BaseTestCase): def test_chunkify(self): data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y' - parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))] + parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))) self.assert_equal(len(parts), 2) self.assert_equal(b''.join(parts), data) - self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], []) - self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) - self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) - self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) - self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], 
[b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) - self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) - self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz']) - self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz']) + self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))), []) + self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) + self.assert_equal(cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) + self.assert_equal(cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) + self.assert_equal(cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3]) + self.assert_equal(cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) + self.assert_equal(cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) + self.assert_equal(cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 
3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3]) + self.assert_equal(cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz']) + self.assert_equal(cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz']) def test_buzhash(self): self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769) @@ -106,5 +120,5 @@ class ChunkerTestCase(BaseTestCase): return self.input[:1] chunker = get_chunker(*CHUNKER_PARAMS, seed=0) - reconstructed = b''.join(chunker.chunkify(SmallReadFile())) + reconstructed = b''.join(cf(chunker.chunkify(SmallReadFile()))) assert reconstructed == b'a' * 20 diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py index daa46bb38..59c7a4515 100644 --- a/src/borg/testsuite/chunker_pytest.py +++ b/src/borg/testsuite/chunker_pytest.py @@ -4,6 +4,7 @@ import tempfile import pytest +from .chunker import cf from ..chunker import ChunkerFixed, sparsemap, has_seek_hole from ..constants import * # NOQA @@ -50,20 +51,18 @@ def make_sparsefile(fname, sparsemap, header_size=0): def make_content(sparsemap, header_size=0): - with BytesIO() as fd: - total = 0 - if header_size: - fd.write(b'H' * header_size) - total += header_size - for offset, size, is_data in sparsemap: - if is_data: - fd.write(b'X' * size) - else: - fd.write(b'\0' * size) - total += size - content = fd.getvalue() - assert len(content) == total - return content + result = [] + total = 0 + if header_size: + result.append(b'H' * header_size) + total += header_size + for offset, size, is_data in sparsemap: + if is_data: + result.append(b'X' * size) # bytes! + else: + result.append(size) # int! 
+ total += size + return result def fs_supports_sparse(): @@ -132,7 +131,7 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse): def get_chunks(fname, sparse, header_size): chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse) with open(fname, 'rb') as fd: - return b''.join([c for c in chunker.chunkify(fd)]) + return cf(chunker.chunkify(fd)) fn = str(tmpdir / fname) make_sparsefile(fn, sparse_map, header_size=header_size) diff --git a/src/borg/testsuite/chunker_slow.py b/src/borg/testsuite/chunker_slow.py index 2739a735a..4247e2730 100644 --- a/src/borg/testsuite/chunker_slow.py +++ b/src/borg/testsuite/chunker_slow.py @@ -1,6 +1,7 @@ from io import BytesIO from binascii import unhexlify +from .chunker import cf from ..chunker import Chunker from ..crypto.low_level import blake2b_256 from ..constants import * # NOQA @@ -30,7 +31,7 @@ class ChunkerRegressionTestCase(BaseTestCase): for seed in (1849058162, 1234567653): fh = BytesIO(data) chunker = Chunker(seed, minexp, maxexp, maskbits, winsize) - chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)] + chunks = [blake2b_256(b'', c) for c in cf(chunker.chunkify(fh, -1))] runs.append(blake2b_256(b'', b''.join(chunks))) # The "correct" hash below matches the existing chunker behavior. From 52bd55b29abfc856316e6e26f970c70a3fef7414 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 15 Dec 2020 02:37:26 +0100 Subject: [PATCH 03/17] integrate Chunk type, avoid hashing holes --- src/borg/archive.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 1555536d5..194814687 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, max_chunk_size +from .chunker import get_chunker, max_chunk_size, Chunk from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec @@ -43,6 +43,7 @@ from .helpers import msgpack from .helpers import sig_int from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .item import Item, ArchiveItem, ItemDiff +from .lrucache import LRUCache from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname from .remote import cache_if_remote from .repository import Repository, LIST_SCAN_LIMIT @@ -336,7 +337,9 @@ class ChunkBuffer: self.buffer.seek(0) # The chunker returns a memoryview to its internal buffer, # thus a copy is needed before resuming the chunker iterator. - chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer)) + # note: this is the items metadata stream chunker, we only will get CH_DATA allocation here, + # thus chunk.data will always be data bytes. 
+ chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer)) self.buffer.seek(0) self.buffer.truncate(0) # Leave the last partial chunk in the buffer unless flush is True @@ -1102,6 +1105,8 @@ class ChunksProcessor: self.checkpoint_interval = checkpoint_interval self.last_checkpoint = time.monotonic() self.rechunkify = rechunkify + self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None) # length of all-zero chunk -> chunk_id + self.zeros = memoryview(bytes(MAX_DATA_SIZE)) def write_part_file(self, item, from_chunk, number): item = Item(internal_dict=item.as_dict()) @@ -1133,8 +1138,22 @@ class ChunksProcessor: def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None): if not chunk_processor: - def chunk_processor(data): - chunk_entry = cache.add_chunk(self.key.id_hash(data), data, stats, wait=False) + def chunk_processor(chunk): + allocation = chunk.meta['allocation'] + if allocation == CH_DATA: + data = chunk.data + chunk_id = self.key.id_hash(data) + elif allocation == CH_HOLE: + size = chunk.meta['size'] + data = self.zeros[:size] + try: + chunk_id = self.zero_chunk_ids[size] + except KeyError: + chunk_id = self.key.id_hash(data) + self.zero_chunk_ids[size] = chunk_id + else: + raise ValueError('unexpected allocation type') + chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False) self.cache.repository.async_response(wait=False) return chunk_entry @@ -1145,8 +1164,8 @@ class ChunksProcessor: del item.chunks_healthy from_chunk = 0 part_number = 1 - for data in chunk_iter: - item.chunks.append(chunk_processor(data)) + for chunk in chunk_iter: + item.chunks.append(chunk_processor(chunk)) if show_progress: stats.show_progress(item=item, dt=0.2) from_chunk, part_number = self.maybe_checkpoint(item, from_chunk, part_number, forced=False) @@ -1982,7 +2001,10 @@ class ArchiveRecreater: chunk_processor = partial(self.chunk_processor, target) target.process_file_chunks(item, self.cache, 
target.stats, self.progress, chunk_iterator, chunk_processor) - def chunk_processor(self, target, data): + def chunk_processor(self, target, chunk): + # as this is recreate (we do not read from the fs), we never have holes here + assert chunk.meta['allocation'] == CH_DATA + data = chunk.data chunk_id = self.key.id_hash(data) if chunk_id in self.seen_chunks: return self.cache.chunk_incref(chunk_id, target.stats) @@ -2007,7 +2029,7 @@ class ArchiveRecreater: yield from target.chunker.chunkify(file) else: for chunk in chunk_iterator: - yield chunk + yield Chunk(chunk, size=len(chunk), allocation=CH_DATA) def save(self, archive, target, comment=None, replace_original=True): if self.dry_run: From 6d0f9a52eb0527f35a36b10a2bfa9e2b9c180fcd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 15 Dec 2020 03:28:48 +0100 Subject: [PATCH 04/17] detect all-zero chunks, avoid hashing them comparing zeros is quicker than hashing them. the comparison should fail quickly inside non-zero data. --- src/borg/archive.py | 5 +++-- src/borg/chunker.pyx | 24 +++++++++++++++++------- src/borg/testsuite/chunker.py | 2 +- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 194814687..cc5c33fb9 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1143,7 +1143,7 @@ class ChunksProcessor: if allocation == CH_DATA: data = chunk.data chunk_id = self.key.id_hash(data) - elif allocation == CH_HOLE: + elif allocation in (CH_HOLE, CH_ALLOC): size = chunk.meta['size'] data = self.zeros[:size] try: @@ -2002,7 +2002,8 @@ class ArchiveRecreater: target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor) def chunk_processor(self, target, chunk): - # as this is recreate (we do not read from the fs), we never have holes here + # as this is recreate (we do not read from the fs), we never have CH_HOLE here, + # but we need to add support for CH_ALLOC - TODO! 
assert chunk.meta['allocation'] == CH_DATA data = chunk.data chunk_id = self.key.id_hash(data) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 0b6f66546..6bf64784c 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -6,7 +6,7 @@ import errno import os from collections import namedtuple -from .constants import CH_DATA, CH_HOLE +from .constants import CH_DATA, CH_ALLOC, CH_HOLE from libc.stdlib cimport free @@ -35,12 +35,16 @@ _Chunk.__doc__ = """\ meta is always a dictionary, data depends on allocation. - on disk data: - meta = {'allocation' = CH_DATA, 'size' = size_of_data } + data chunk read from a DATA range of a file (not from a sparse hole): + meta = {'allocation' = CH_DATA, 'size' = size_of_chunk } data = read_data [bytes or memoryview] - hole in a sparse file: - meta = {'allocation' = CH_HOLE, 'size' = size_of_hole } + all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero): + meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk } + data = None + + all-zero chunk from a HOLE range of a file (from a sparse hole): + meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk } data = None """ @@ -201,15 +205,21 @@ class ChunkerFixed: # read block from the range data = dread(offset, wanted, fd, fh) got = len(data) + if data == self.zeros[:got]: + data = None + is_zero = True + else: + is_zero = False else: # hole # seek over block from the range pos = dseek(wanted, os.SEEK_CUR, fd, fh) - data = None got = pos - offset + data = None + is_zero = True if got > 0: offset += got range_size -= got - yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE) + yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE) if got < wanted: # we did not get enough data, looks like EOF. 
return diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py index 7a0db7d36..1b275978c 100644 --- a/src/borg/testsuite/chunker.py +++ b/src/borg/testsuite/chunker.py @@ -15,7 +15,7 @@ def cf(chunks): if chunk.meta['allocation'] == CH_DATA: assert len(chunk.data) == chunk.meta['size'] return bytes(chunk.data) # make sure we have bytes, not memoryview - if chunk.meta['allocation'] == CH_HOLE: + if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC): assert chunk.data is None return chunk.meta['size'] assert False, "unexpected allocation value" From 9fd284ce1a9c310571049aa1d7ad0a6fa89b8a26 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 18:38:35 +0100 Subject: [PATCH 05/17] refactor new zero chunk handling to be reusable --- src/borg/archive.py | 20 ++------------------ src/borg/chunker.pyx | 30 +++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index cc5c33fb9..eff10b4bf 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, max_chunk_size, Chunk +from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec @@ -43,7 +43,6 @@ from .helpers import msgpack from .helpers import sig_int from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .item import Item, ArchiveItem, ItemDiff -from .lrucache import LRUCache from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname from .remote import cache_if_remote from .repository import Repository, LIST_SCAN_LIMIT @@ -1105,8 +1104,6 @@ class ChunksProcessor: self.checkpoint_interval = checkpoint_interval self.last_checkpoint = time.monotonic() self.rechunkify = rechunkify - self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None) # length of all-zero chunk -> chunk_id - self.zeros = memoryview(bytes(MAX_DATA_SIZE)) def write_part_file(self, item, from_chunk, number): item = Item(internal_dict=item.as_dict()) @@ -1139,20 +1136,7 @@ class ChunksProcessor: def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None): if not chunk_processor: def chunk_processor(chunk): - allocation = chunk.meta['allocation'] - if allocation == CH_DATA: - data = chunk.data - chunk_id = self.key.id_hash(data) - elif allocation in (CH_HOLE, CH_ALLOC): - size = chunk.meta['size'] - data = self.zeros[:size] - try: - chunk_id = self.zero_chunk_ids[size] - except KeyError: - chunk_id = self.key.id_hash(data) - self.zero_chunk_ids[size] = chunk_id - else: - raise ValueError('unexpected allocation type') + chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash) chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False) self.cache.repository.async_response(wait=False) return chunk_entry diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 6bf64784c..1fd316a03 100644 --- a/src/borg/chunker.pyx +++ 
b/src/borg/chunker.pyx @@ -6,7 +6,8 @@ import errno import os from collections import namedtuple -from .constants import CH_DATA, CH_ALLOC, CH_HOLE +from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE +from .lrucache import LRUCache from libc.stdlib cimport free @@ -52,6 +53,33 @@ def Chunk(data, **meta): return _Chunk(meta, data) +zeros = bytes(MAX_DATA_SIZE) + +# remember a few recently used all-zero chunk hashes in this mapping. +# (hash_func, chunk_length) -> chunk_hash +# we play safe and have the hash_func in the mapping key, in case we +# have different hash_funcs within the same borg run. +zero_chunk_ids = LRUCache(10, dispose=lambda _: None) + +def chunk_to_id_data(chunk, id_hash): + allocation = chunk.meta['allocation'] + if allocation == CH_DATA: + data = chunk.data + chunk_id = id_hash(data) + elif allocation in (CH_HOLE, CH_ALLOC): + size = chunk.meta['size'] + assert size <= len(zeros) + data = memoryview(zeros)[:size] + try: + chunk_id = zero_chunk_ids[(id_hash, size)] + except KeyError: + chunk_id = id_hash(data) + zero_chunk_ids[(id_hash, size)] = chunk_id + else: + raise ValueError('unexpected allocation type') + return chunk_id, data + + def dread(offset, size, fd=None, fh=-1): use_fh = fh >= 0 if use_fh: From b3659e0b8c371fb122878bd94e6303f9ec2d5847 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 18:45:46 +0100 Subject: [PATCH 06/17] reuse chunker.zeros for sparse extraction --- src/borg/archive.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index eff10b4bf..a3fdd371c 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data +from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data, zeros from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec @@ -424,7 +424,6 @@ class Archive: if info is None: raise self.DoesNotExist(name) self.load(info.id) - self.zeros = None def _load_meta(self, id): data = self.key.decrypt(id, self.repository.get(id)) @@ -737,8 +736,6 @@ Utilization of max. archive size: {csize_max:.0%} hardlink_masters) as hardlink_set: if hardlink_set: return - if sparse and self.zeros is None: - self.zeros = b'\0' * max_chunk_size(*self.chunker_params) with backup_io('open'): fd = open(path, 'wb') with fd: @@ -747,7 +744,7 @@ Utilization of max. archive size: {csize_max:.0%} if pi: pi.show(increase=len(data), info=[remove_surrogates(item.path)]) with backup_io('write'): - if sparse and self.zeros.startswith(data): + if sparse and zeros.startswith(data): # all-zero chunk: create a hole in a sparse file fd.seek(len(data), 1) else: From 92f221075aa51a470892eeadbe5f91bddd30a4af Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 18:53:35 +0100 Subject: [PATCH 07/17] refactor recreate to use chunk_to_id_data --- src/borg/archive.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index a3fdd371c..6c4aa2b74 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1983,11 +1983,7 @@ class ArchiveRecreater: target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor) def chunk_processor(self, target, chunk): - # as this is recreate (we do not read from the fs), we never have CH_HOLE here, - # but we need to add support for CH_ALLOC - TODO! 
- assert chunk.meta['allocation'] == CH_DATA - data = chunk.data - chunk_id = self.key.id_hash(data) + chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash) if chunk_id in self.seen_chunks: return self.cache.chunk_incref(chunk_id, target.stats) overwrite = self.recompress From f3088a989356476cfc0bf04948a80f714d05a2b0 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 19:16:47 +0100 Subject: [PATCH 08/17] rename chunk_to_id_data to cached_hash --- src/borg/archive.py | 6 +++--- src/borg/chunker.pyx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 6c4aa2b74..9ac37e7f1 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . import xattr -from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data, zeros +from .chunker import get_chunker, max_chunk_size, Chunk, cached_hash, zeros from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec @@ -1133,7 +1133,7 @@ class ChunksProcessor: def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None): if not chunk_processor: def chunk_processor(chunk): - chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash) + chunk_id, data = cached_hash(chunk, self.key.id_hash) chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False) self.cache.repository.async_response(wait=False) return chunk_entry @@ -1983,7 +1983,7 @@ class ArchiveRecreater: target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor) def chunk_processor(self, target, chunk): - chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash) + chunk_id, data = cached_hash(chunk, self.key.id_hash) if chunk_id in self.seen_chunks: return self.cache.chunk_incref(chunk_id, target.stats) overwrite = self.recompress diff --git 
a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 1fd316a03..1da811500 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -61,7 +61,7 @@ zeros = bytes(MAX_DATA_SIZE) # have different hash_funcs within the same borg run. zero_chunk_ids = LRUCache(10, dispose=lambda _: None) -def chunk_to_id_data(chunk, id_hash): +def cached_hash(chunk, id_hash): allocation = chunk.meta['allocation'] if allocation == CH_DATA: data = chunk.data From ef19d937ed09885b1c545cf179478053f7d3b93d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 19:29:29 +0100 Subject: [PATCH 09/17] use cached_hash also to generate all-zero replacement chunks at least for major amounts of fixed-size replacement hashes, this will be much faster. also less memory management overhead. --- src/borg/archive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 9ac37e7f1..b023a9cac 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1662,8 +1662,8 @@ class ArchiveChecker: If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one. 
""" def replacement_chunk(size): - data = bytes(size) - chunk_id = self.key.id_hash(data) + chunk = Chunk(None, allocation=CH_ALLOC, size=size) + chunk_id, data = cached_hash(chunk, self.key.id_hash) cdata = self.key.encrypt(data) csize = len(cdata) return chunk_id, size, csize, cdata From 4e3be1db5e73cb5a90f0a2865ccd961da7bc0661 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 8 Jan 2021 20:03:34 +0100 Subject: [PATCH 10/17] reuse zeros also in fixed-size chunker for all-zero chunk detection also: zeros.startswith() is faster --- src/borg/chunker.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 1da811500..8c3a297ab 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -178,7 +178,7 @@ class ChunkerFixed: # should borg try to do sparse input processing? # whether it actually can be done depends on the input file being seekable. self.try_sparse = sparse and has_seek_hole - self.zeros = memoryview(bytes(block_size)) + assert block_size <= len(zeros) def chunkify(self, fd=None, fh=-1, fmap=None): """ @@ -233,7 +233,7 @@ class ChunkerFixed: # read block from the range data = dread(offset, wanted, fd, fh) got = len(data) - if data == self.zeros[:got]: + if zeros.startswith(data): data = None is_zero = True else: From 3b9798cffcb5b608bc103841d1d7bc22c41ea5ad Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2021 19:56:39 +0100 Subject: [PATCH 11/17] remove max_chunk_size (unused) --- src/borg/archive.py | 2 +- src/borg/chunker.pyx | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index b023a9cac..00a0867d0 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, max_chunk_size, Chunk, cached_hash, zeros +from .chunker import get_chunker, Chunk, cached_hash, zeros from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 8c3a297ab..7f763ff24 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -308,15 +308,6 @@ def get_chunker(algo, *params, **kw): raise TypeError('unsupported chunker algo %r' % algo) -def max_chunk_size(algo, *params): - # see also parseformat.ChunkerParams return values - if algo == 'buzhash': - return 1 << params[1] - if algo == 'fixed': - return max(params[0], params[1]) - raise TypeError('unsupported chunker algo %r' % algo) - - def buzhash(data, unsigned long seed): cdef uint32_t *table cdef uint32_t sum From be257728cab4020b21d84ae06564e4af21366cf1 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2021 20:02:18 +0100 Subject: [PATCH 12/17] move zeros to constants module --- src/borg/archive.py | 2 +- src/borg/chunker.pyx | 4 +--- src/borg/constants.py | 4 ++++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 00a0867d0..90013bd29 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, Chunk, cached_hash, zeros +from .chunker import get_chunker, Chunk, cached_hash from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 7f763ff24..099532308 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -6,7 +6,7 @@ import errno import os from collections import namedtuple -from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE +from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros from .lrucache import LRUCache from libc.stdlib cimport free @@ -53,8 +53,6 @@ def Chunk(data, **meta): return _Chunk(meta, data) -zeros = bytes(MAX_DATA_SIZE) - # remember a few recently used all-zero chunk hashes in this mapping. # (hash_func, chunk_length) -> chunk_hash # we play safe and have the hash_func in the mapping key, in case we diff --git a/src/borg/constants.py b/src/borg/constants.py index 46c2b564c..1bd9bb6dd 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -45,6 +45,10 @@ assert MAX_OBJECT_SIZE == 20 * 1024 * 1024 # repo config max_segment_size value must be below this limit to stay within uint32 offsets: MAX_SEGMENT_SIZE_LIMIT = 2 ** 32 - MAX_OBJECT_SIZE +# have one all-zero bytes object +# we use it at all places where we need to detect or create all-zero buffers +zeros = bytes(MAX_DATA_SIZE) + # borg.remote read() buffer size BUFSIZE = 10 * 1024 * 1024 From e41dc6e96fe9ca5960d449d1479713cb537682c4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2021 20:19:10 +0100 Subject: [PATCH 13/17] use zeros for benchmarks --- src/borg/archiver.py | 3 ++- src/borg/testsuite/benchmark.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 2fe47056d..37f62fb52 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -453,9 +453,10 @@ class Archiver: def 
test_files(path, count, size, random): path = os.path.join(path, 'borg-test-data') os.makedirs(path) + z_buff = None if random else memoryview(zeros)[:size] if size <= len(zeros) else b'\0' * size for i in range(count): fname = os.path.join(path, 'file_%d' % i) - data = b'\0' * size if not random else os.urandom(size) + data = z_buff if not random else os.urandom(size) with SyncFile(fname, binary=True) as fd: # used for posix_fadvise's sake fd.write(data) yield path diff --git a/src/borg/testsuite/benchmark.py b/src/borg/testsuite/benchmark.py index 1e70a101f..f3ec06f2a 100644 --- a/src/borg/testsuite/benchmark.py +++ b/src/borg/testsuite/benchmark.py @@ -11,6 +11,7 @@ import os import pytest from .archiver import changedir, cmd +from ..constants import zeros @pytest.fixture @@ -34,12 +35,13 @@ def repo(request, cmd, repo_url): @pytest.fixture(scope='session', params=["zeros", "random"]) def testdata(request, tmpdir_factory): count, size = 10, 1000*1000 + assert size <= len(zeros) p = tmpdir_factory.mktemp('data') data_type = request.param if data_type == 'zeros': # do not use a binary zero (\0) to avoid sparse detection def data(size): - return b'0' * size + return memoryview(zeros)[:size] elif data_type == 'random': def data(size): return os.urandom(size) From 8162e2e67b7b817d6b01a60df52bb67256fd2ba3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2021 20:41:57 +0100 Subject: [PATCH 14/17] cached_hash is only used in archive, move it there --- src/borg/archive.py | 29 ++++++++++++++++++++++++++++- src/borg/chunker.pyx | 26 -------------------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 90013bd29..b597a1aa6 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -19,7 +19,7 @@ from .logger import create_logger logger = create_logger() from . 
import xattr -from .chunker import get_chunker, Chunk, cached_hash +from .chunker import get_chunker, Chunk from .cache import ChunkListEntry from .crypto.key import key_factory from .compress import Compressor, CompressionSpec @@ -41,6 +41,7 @@ from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .helpers import os_open, flags_normal, flags_dir from .helpers import msgpack from .helpers import sig_int +from .lrucache import LRUCache from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .item import Item, ArchiveItem, ItemDiff from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname @@ -1088,6 +1089,32 @@ class MetadataCollector: return attrs +# remember a few recently used all-zero chunk hashes in this mapping. +# (hash_func, chunk_length) -> chunk_hash +# we play safe and have the hash_func in the mapping key, in case we +# have different hash_funcs within the same borg run. +zero_chunk_ids = LRUCache(10, dispose=lambda _: None) + + +def cached_hash(chunk, id_hash): + allocation = chunk.meta['allocation'] + if allocation == CH_DATA: + data = chunk.data + chunk_id = id_hash(data) + elif allocation in (CH_HOLE, CH_ALLOC): + size = chunk.meta['size'] + assert size <= len(zeros) + data = memoryview(zeros)[:size] + try: + chunk_id = zero_chunk_ids[(id_hash, size)] + except KeyError: + chunk_id = id_hash(data) + zero_chunk_ids[(id_hash, size)] = chunk_id + else: + raise ValueError('unexpected allocation type') + return chunk_id, data + + class ChunksProcessor: # Processes an iterator of chunks for an Item diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 099532308..210ea461c 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -7,7 +7,6 @@ import os from collections import namedtuple from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros -from .lrucache import LRUCache from libc.stdlib cimport free @@ -53,31 +52,6 @@ def Chunk(data, **meta): return _Chunk(meta, 
data) -# remember a few recently used all-zero chunk hashes in this mapping. -# (hash_func, chunk_length) -> chunk_hash -# we play safe and have the hash_func in the mapping key, in case we -# have different hash_funcs within the same borg run. -zero_chunk_ids = LRUCache(10, dispose=lambda _: None) - -def cached_hash(chunk, id_hash): - allocation = chunk.meta['allocation'] - if allocation == CH_DATA: - data = chunk.data - chunk_id = id_hash(data) - elif allocation in (CH_HOLE, CH_ALLOC): - size = chunk.meta['size'] - assert size <= len(zeros) - data = memoryview(zeros)[:size] - try: - chunk_id = zero_chunk_ids[(id_hash, size)] - except KeyError: - chunk_id = id_hash(data) - zero_chunk_ids[(id_hash, size)] = chunk_id - else: - raise ValueError('unexpected allocation type') - return chunk_id, data - - def dread(offset, size, fd=None, fh=-1): use_fh = fh >= 0 if use_fh: From 2d7636521496d4e76b9d23ea635e9ec7faa37713 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 15 Jan 2021 21:10:07 +0100 Subject: [PATCH 15/17] cosmetic: directly set allocation instead going via is_zero --- src/borg/chunker.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 210ea461c..3af84807b 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -207,19 +207,19 @@ class ChunkerFixed: got = len(data) if zeros.startswith(data): data = None - is_zero = True + allocation = CH_ALLOC else: - is_zero = False + allocation = CH_DATA else: # hole # seek over block from the range pos = dseek(wanted, os.SEEK_CUR, fd, fh) got = pos - offset data = None - is_zero = True + allocation = CH_HOLE if got > 0: offset += got range_size -= got - yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE) + yield Chunk(data, size=got, allocation=allocation) if got < wanted: # we did not get enough data, looks like EOF. 
return From 2391d160a80fd53deb7f9eb137b24d7fa78a067b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 15 Jan 2021 21:27:29 +0100 Subject: [PATCH 16/17] add all-zero detection to buzhash chunk data processing --- src/borg/chunker.pyx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 3af84807b..ee9773be4 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -242,6 +242,7 @@ cdef class Chunker: def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size): min_size = 1 << chunk_min_exp max_size = 1 << chunk_max_exp + assert max_size <= len(zeros) # see chunker_process, first while loop condition, first term must be able to get True: assert hash_window_size + min_size + 1 <= max_size, "too small max_size" hash_mask = (1 << hash_mask_bits) - 1 @@ -267,7 +268,16 @@ cdef class Chunker: def __next__(self): data = chunker_process(self.chunker) - return Chunk(data, size=len(data), allocation=CH_DATA) # no sparse support here + got = len(data) + # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code, + # but we can just check if data was all-zero (and either came from a hole + # or from stored zeros - we can not detect that here). 
+ if zeros.startswith(data): + data = None + allocation = CH_ALLOC + else: + allocation = CH_DATA + return Chunk(data, size=got, allocation=allocation) def get_chunker(algo, *params, **kw): From 6dc334422e8e34c6f7e45728ab8956c57e3bcd30 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 15 Jan 2021 21:51:15 +0100 Subject: [PATCH 17/17] fixup: improve comment about assumptions in the item metadata stream chunker --- src/borg/archive.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index b597a1aa6..c0e2fe0f5 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -337,8 +337,9 @@ class ChunkBuffer: self.buffer.seek(0) # The chunker returns a memoryview to its internal buffer, # thus a copy is needed before resuming the chunker iterator. - # note: this is the items metadata stream chunker, we only will get CH_DATA allocation here, - # thus chunk.data will always be data bytes. + # note: this is the items metadata stream chunker, we will only get CH_DATA allocation here (because there are + # no all-zero chunks in a metadata stream), thus chunk.data will always be bytes/memoryview and allocation + # is always CH_DATA and never CH_ALLOC/CH_HOLE. chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer)) self.buffer.seek(0) self.buffer.truncate(0)