From 8c299696aa21f6a64eee64663410d7e06e33529f Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Mon, 14 Dec 2020 23:46:04 +0100
Subject: [PATCH 01/17] Chunker: yield Chunk namedtuple instead of
 bytes/memoryview

---
 src/borg/chunker.pyx  | 32 ++++++++++++++++++++++++++++----
 src/borg/constants.py |  3 +++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 03122ec4b..0b6f66546 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -4,6 +4,9 @@ API_VERSION = '1.2_01'
 
 import errno
 import os
+from collections import namedtuple
+
+from .constants import CH_DATA, CH_HOLE
 
 from libc.stdlib cimport free
 
@@ -26,6 +29,25 @@ cdef extern from "_chunker.c":
 has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
 
 
+_Chunk = namedtuple('_Chunk', 'meta data')
+_Chunk.__doc__ = """\
+    Chunk namedtuple
+
+    meta is always a dictionary, data depends on allocation.
+
+    on disk data:
+        meta = {'allocation' = CH_DATA, 'size' = size_of_data }
+        data = read_data [bytes or memoryview]
+
+    hole in a sparse file:
+        meta = {'allocation' = CH_HOLE, 'size' = size_of_hole }
+        data = None
+"""
+
+def Chunk(data, **meta):
+    return _Chunk(meta, data)
+
+
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh:
@@ -178,15 +200,16 @@ class ChunkerFixed:
                 if is_data:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
+                    got = len(data)
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
-                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
-                got = len(data)
+                    data = None
+                    got = pos - offset
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield data  # later, use a better api that tags data vs. hole
+                    yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
@@ -233,7 +256,8 @@ cdef class Chunker:
         return self
 
     def __next__(self):
-        return chunker_process(self.chunker)
+        data = chunker_process(self.chunker)
+        return Chunk(data, size=len(data), allocation=CH_DATA)  # no sparse support here
 
 
 def get_chunker(algo, *params, **kw):
diff --git a/src/borg/constants.py b/src/borg/constants.py
index a20719c65..46c2b564c 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -75,6 +75,9 @@ CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
 
+# normal on-disk data, allocated (but not written, all zeros), not allocated hole (all zeros)
+CH_DATA, CH_ALLOC, CH_HOLE = 0, 1, 2
+
 # operating mode of the files cache (for fast skipping of unchanged files)
 DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
 DEFAULT_FILES_CACHE_MODE = 'cis'  # == CacheMode(DEFAULT_FILES_CACHE_MODE_UI)

From 7319f85b546bff883bde7155f28b453a9dc87f93 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 15 Dec 2020 00:26:32 +0100
Subject: [PATCH 02/17] adapt the existing chunker tests

---
 src/borg/testsuite/chunker.py        | 54 +++++++++++++++++-----------
 src/borg/testsuite/chunker_pytest.py | 29 ++++++++-------
 src/borg/testsuite/chunker_slow.py   |  3 +-
 3 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index df79441b6..7a0db7d36 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -8,18 +8,32 @@ from . import BaseTestCase
 #       See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
 
+def cf(chunks):
+    """chunk filter"""
+    # this is to simplify testing: either return the data piece (bytes) or the hole length (int).
+    def _cf(chunk):
+        if chunk.meta['allocation'] == CH_DATA:
+            assert len(chunk.data) == chunk.meta['size']
+            return bytes(chunk.data)  # make sure we have bytes, not memoryview
+        if chunk.meta['allocation'] == CH_HOLE:
+            assert chunk.data is None
+            return chunk.meta['size']
+        assert False, "unexpected allocation value"
+    return [_cf(chunk) for chunk in chunks]
+
+
 class ChunkerFixedTestCase(BaseTestCase):
 
     def test_chunkify_just_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks(self):
         data = b'foobar' * 1500
         chunker = ChunkerFixed(4096, 123)
-        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        parts = cf(chunker.chunkify(BytesIO(data)))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_just_blocks_fmap_complete(self):
@@ -30,7 +44,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (4096, 8192, True),
             (8192, 99999999, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
 
     def test_chunkify_header_and_blocks_fmap_complete(self):
@@ -42,7 +56,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, True),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
     def test_chunkify_header_and_blocks_fmap_zeros(self):
@@ -54,9 +68,9 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
-        # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
-        self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
+        # because we marked the '_' ranges as holes, we will get hole ranges instead!
+        self.assert_equal(parts, [data[0:123], 4096, data[123+4096:123+8192], 4096])
 
     def test_chunkify_header_and_blocks_fmap_partial(self):
         data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
@@ -67,7 +81,7 @@ class ChunkerFixedTestCase(BaseTestCase):
             (123+4096, 4096, True),
             # (123+8192, 4096, False),
         ]
-        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        parts = cf(chunker.chunkify(BytesIO(data), fmap=fmap))
         # because we left out the '_' ranges from the fmap, we will not get them at all!
         self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
 
@@ -76,19 +90,19 @@ class ChunkerTestCase(BaseTestCase):
 
     def test_chunkify(self):
         data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
-        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
+        parts = cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data)))
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))), [])
+        self.assert_equal(cf(Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal(cf(Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal(cf(Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal(cf(Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal(cf(Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal(cf(Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz' * 3])
+        self.assert_equal(cf(Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal(cf(Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))), [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
 
     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -106,5 +120,5 @@ class ChunkerTestCase(BaseTestCase):
                 return self.input[:1]
 
         chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
-        reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
+        reconstructed = b''.join(cf(chunker.chunkify(SmallReadFile())))
         assert reconstructed == b'a' * 20
diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py
index daa46bb38..59c7a4515 100644
--- a/src/borg/testsuite/chunker_pytest.py
+++ b/src/borg/testsuite/chunker_pytest.py
@@ -4,6 +4,7 @@ import tempfile
 
 import pytest
 
+from .chunker import cf
 from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
 from ..constants import *  # NOQA
 
@@ -50,20 +51,18 @@ def make_sparsefile(fname, sparsemap, header_size=0):
 
 
 def make_content(sparsemap, header_size=0):
-    with BytesIO() as fd:
-        total = 0
-        if header_size:
-            fd.write(b'H' * header_size)
-            total += header_size
-        for offset, size, is_data in sparsemap:
-            if is_data:
-                fd.write(b'X' * size)
-            else:
-                fd.write(b'\0' * size)
-            total += size
-        content = fd.getvalue()
-    assert len(content) == total
-    return content
+    result = []
+    total = 0
+    if header_size:
+        result.append(b'H' * header_size)
+        total += header_size
+    for offset, size, is_data in sparsemap:
+        if is_data:
+            result.append(b'X' * size)  # bytes!
+        else:
+            result.append(size)  # int!
+        total += size
+    return result
 
 
 def fs_supports_sparse():
@@ -132,7 +131,7 @@ def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
     def get_chunks(fname, sparse, header_size):
         chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
         with open(fname, 'rb') as fd:
-            return b''.join([c for c in chunker.chunkify(fd)])
+            return cf(chunker.chunkify(fd))
 
     fn = str(tmpdir / fname)
     make_sparsefile(fn, sparse_map, header_size=header_size)
diff --git a/src/borg/testsuite/chunker_slow.py b/src/borg/testsuite/chunker_slow.py
index 2739a735a..4247e2730 100644
--- a/src/borg/testsuite/chunker_slow.py
+++ b/src/borg/testsuite/chunker_slow.py
@@ -1,6 +1,7 @@
 from io import BytesIO
 from binascii import unhexlify
 
+from .chunker import cf
 from ..chunker import Chunker
 from ..crypto.low_level import blake2b_256
 from ..constants import *  # NOQA
@@ -30,7 +31,7 @@ class ChunkerRegressionTestCase(BaseTestCase):
                         for seed in (1849058162, 1234567653):
                             fh = BytesIO(data)
                             chunker = Chunker(seed, minexp, maxexp, maskbits, winsize)
-                            chunks = [blake2b_256(b'', c) for c in chunker.chunkify(fh, -1)]
+                            chunks = [blake2b_256(b'', c) for c in cf(chunker.chunkify(fh, -1))]
                             runs.append(blake2b_256(b'', b''.join(chunks)))
 
         # The "correct" hash below matches the existing chunker behavior.

From 52bd55b29abfc856316e6e26f970c70a3fef7414 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 15 Dec 2020 02:37:26 +0100
Subject: [PATCH 03/17] integrate Chunk type, avoid hashing holes

---
 src/borg/archive.py | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 1555536d5..194814687 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size
+from .chunker import get_chunker, max_chunk_size, Chunk
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -43,6 +43,7 @@ from .helpers import msgpack
 from .helpers import sig_int
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
+from .lrucache import LRUCache
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
 from .remote import cache_if_remote
 from .repository import Repository, LIST_SCAN_LIMIT
@@ -336,7 +337,9 @@ class ChunkBuffer:
         self.buffer.seek(0)
         # The chunker returns a memoryview to its internal buffer,
         # thus a copy is needed before resuming the chunker iterator.
-        chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer))
+        # note: this is the items metadata stream chunker, we only will get CH_DATA allocation here,
+        #       thus chunk.data will always be data bytes.
+        chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer))
         self.buffer.seek(0)
         self.buffer.truncate(0)
         # Leave the last partial chunk in the buffer unless flush is True
@@ -1102,6 +1105,8 @@ class ChunksProcessor:
         self.checkpoint_interval = checkpoint_interval
         self.last_checkpoint = time.monotonic()
         self.rechunkify = rechunkify
+        self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None)  # length of all-zero chunk -> chunk_id
+        self.zeros = memoryview(bytes(MAX_DATA_SIZE))
 
     def write_part_file(self, item, from_chunk, number):
         item = Item(internal_dict=item.as_dict())
@@ -1133,8 +1138,22 @@ class ChunksProcessor:
 
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
-            def chunk_processor(data):
-                chunk_entry = cache.add_chunk(self.key.id_hash(data), data, stats, wait=False)
+            def chunk_processor(chunk):
+                allocation = chunk.meta['allocation']
+                if allocation == CH_DATA:
+                    data = chunk.data
+                    chunk_id = self.key.id_hash(data)
+                elif allocation == CH_HOLE:
+                    size = chunk.meta['size']
+                    data = self.zeros[:size]
+                    try:
+                        chunk_id = self.zero_chunk_ids[size]
+                    except KeyError:
+                        chunk_id = self.key.id_hash(data)
+                        self.zero_chunk_ids[size] = chunk_id
+                else:
+                    raise ValueError('unexpected allocation type')
+                chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
 
@@ -1145,8 +1164,8 @@ class ChunksProcessor:
             del item.chunks_healthy
         from_chunk = 0
         part_number = 1
-        for data in chunk_iter:
-            item.chunks.append(chunk_processor(data))
+        for chunk in chunk_iter:
+            item.chunks.append(chunk_processor(chunk))
             if show_progress:
                 stats.show_progress(item=item, dt=0.2)
             from_chunk, part_number = self.maybe_checkpoint(item, from_chunk, part_number, forced=False)
@@ -1982,7 +2001,10 @@ class ArchiveRecreater:
         chunk_processor = partial(self.chunk_processor, target)
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
-    def chunk_processor(self, target, data):
+    def chunk_processor(self, target, chunk):
+        # as this is recreate (we do not read from the fs), we never have holes here
+        assert chunk.meta['allocation'] == CH_DATA
+        data = chunk.data
         chunk_id = self.key.id_hash(data)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
@@ -2007,7 +2029,7 @@ class ArchiveRecreater:
             yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
-                yield chunk
+                yield Chunk(chunk, size=len(chunk), allocation=CH_DATA)
 
     def save(self, archive, target, comment=None, replace_original=True):
         if self.dry_run:

From 6d0f9a52eb0527f35a36b10a2bfa9e2b9c180fcd Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 15 Dec 2020 03:28:48 +0100
Subject: [PATCH 04/17] detect all-zero chunks, avoid hashing them

Comparing a chunk against zeros is quicker than hashing it.
For non-zero data, the comparison should fail quickly (at the first differing byte).
---
 src/borg/archive.py           |  5 +++--
 src/borg/chunker.pyx          | 24 +++++++++++++++++-------
 src/borg/testsuite/chunker.py |  2 +-
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 194814687..cc5c33fb9 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1143,7 +1143,7 @@ class ChunksProcessor:
                 if allocation == CH_DATA:
                     data = chunk.data
                     chunk_id = self.key.id_hash(data)
-                elif allocation == CH_HOLE:
+                elif allocation in (CH_HOLE, CH_ALLOC):
                     size = chunk.meta['size']
                     data = self.zeros[:size]
                     try:
@@ -2002,7 +2002,8 @@ class ArchiveRecreater:
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
     def chunk_processor(self, target, chunk):
-        # as this is recreate (we do not read from the fs), we never have holes here
+        # as this is recreate (we do not read from the fs), we never have CH_HOLE here,
+        # but we need to add support for CH_ALLOC - TODO!
         assert chunk.meta['allocation'] == CH_DATA
         data = chunk.data
         chunk_id = self.key.id_hash(data)
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 0b6f66546..6bf64784c 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -6,7 +6,7 @@ import errno
 import os
 from collections import namedtuple
 
-from .constants import CH_DATA, CH_HOLE
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE
 
 from libc.stdlib cimport free
 
@@ -35,12 +35,16 @@ _Chunk.__doc__ = """\
 
     meta is always a dictionary, data depends on allocation.
 
-    on disk data:
-        meta = {'allocation' = CH_DATA, 'size' = size_of_data }
+    data chunk read from a DATA range of a file (not from a sparse hole):
+        meta = {'allocation' = CH_DATA, 'size' = size_of_chunk }
         data = read_data [bytes or memoryview]
 
-    hole in a sparse file:
-        meta = {'allocation' = CH_HOLE, 'size' = size_of_hole }
+    all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
+        meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk }
+        data = None
+
+    all-zero chunk from a HOLE range of a file (from a sparse hole):
+        meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk }
         data = None
 """
 
@@ -201,15 +205,21 @@ class ChunkerFixed:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
                     got = len(data)
+                    if data == self.zeros[:got]:
+                        data = None
+                        is_zero = True
+                    else:
+                        is_zero = False
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
-                    data = None
                     got = pos - offset
+                    data = None
+                    is_zero = True
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE)
+                    yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index 7a0db7d36..1b275978c 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -15,7 +15,7 @@ def cf(chunks):
         if chunk.meta['allocation'] == CH_DATA:
             assert len(chunk.data) == chunk.meta['size']
             return bytes(chunk.data)  # make sure we have bytes, not memoryview
-        if chunk.meta['allocation'] == CH_HOLE:
+        if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC):
             assert chunk.data is None
             return chunk.meta['size']
         assert False, "unexpected allocation value"

From 9fd284ce1a9c310571049aa1d7ad0a6fa89b8a26 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 18:38:35 +0100
Subject: [PATCH 05/17] refactor new zero chunk handling to be reusable

---
 src/borg/archive.py  | 20 ++------------------
 src/borg/chunker.pyx | 30 +++++++++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index cc5c33fb9..eff10b4bf 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size, Chunk
+from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -43,7 +43,6 @@ from .helpers import msgpack
 from .helpers import sig_int
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
-from .lrucache import LRUCache
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
 from .remote import cache_if_remote
 from .repository import Repository, LIST_SCAN_LIMIT
@@ -1105,8 +1104,6 @@ class ChunksProcessor:
         self.checkpoint_interval = checkpoint_interval
         self.last_checkpoint = time.monotonic()
         self.rechunkify = rechunkify
-        self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None)  # length of all-zero chunk -> chunk_id
-        self.zeros = memoryview(bytes(MAX_DATA_SIZE))
 
     def write_part_file(self, item, from_chunk, number):
         item = Item(internal_dict=item.as_dict())
@@ -1139,20 +1136,7 @@ class ChunksProcessor:
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
             def chunk_processor(chunk):
-                allocation = chunk.meta['allocation']
-                if allocation == CH_DATA:
-                    data = chunk.data
-                    chunk_id = self.key.id_hash(data)
-                elif allocation in (CH_HOLE, CH_ALLOC):
-                    size = chunk.meta['size']
-                    data = self.zeros[:size]
-                    try:
-                        chunk_id = self.zero_chunk_ids[size]
-                    except KeyError:
-                        chunk_id = self.key.id_hash(data)
-                        self.zero_chunk_ids[size] = chunk_id
-                else:
-                    raise ValueError('unexpected allocation type')
+                chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
                 chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 6bf64784c..1fd316a03 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -6,7 +6,8 @@ import errno
 import os
 from collections import namedtuple
 
-from .constants import CH_DATA, CH_ALLOC, CH_HOLE
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE
+from .lrucache import LRUCache
 
 from libc.stdlib cimport free
 
@@ -52,6 +53,33 @@ def Chunk(data, **meta):
     return _Chunk(meta, data)
 
 
+zeros = bytes(MAX_DATA_SIZE)
+
+# remember a few recently used all-zero chunk hashes in this mapping.
+# (hash_func, chunk_length) -> chunk_hash
+# we play safe and have the hash_func in the mapping key, in case we
+# have different hash_funcs within the same borg run.
+zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
+
+def chunk_to_id_data(chunk, id_hash):
+    allocation = chunk.meta['allocation']
+    if allocation == CH_DATA:
+        data = chunk.data
+        chunk_id = id_hash(data)
+    elif allocation in (CH_HOLE, CH_ALLOC):
+        size = chunk.meta['size']
+        assert size <= len(zeros)
+        data = memoryview(zeros)[:size]
+        try:
+            chunk_id = zero_chunk_ids[(id_hash, size)]
+        except KeyError:
+            chunk_id = id_hash(data)
+            zero_chunk_ids[(id_hash, size)] = chunk_id
+    else:
+        raise ValueError('unexpected allocation type')
+    return chunk_id, data
+
+
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh:

From b3659e0b8c371fb122878bd94e6303f9ec2d5847 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 18:45:46 +0100
Subject: [PATCH 06/17] reuse chunker.zeros for sparse extraction

---
 src/borg/archive.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index eff10b4bf..a3fdd371c 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data
+from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data, zeros
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -424,7 +424,6 @@ class Archive:
             if info is None:
                 raise self.DoesNotExist(name)
             self.load(info.id)
-            self.zeros = None
 
     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -737,8 +736,6 @@ Utilization of max. archive size: {csize_max:.0%}
                                      hardlink_masters) as hardlink_set:
                 if hardlink_set:
                     return
-                if sparse and self.zeros is None:
-                    self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
                 with backup_io('open'):
                     fd = open(path, 'wb')
                 with fd:
@@ -747,7 +744,7 @@ Utilization of max. archive size: {csize_max:.0%}
                         if pi:
                             pi.show(increase=len(data), info=[remove_surrogates(item.path)])
                         with backup_io('write'):
-                            if sparse and self.zeros.startswith(data):
+                            if sparse and zeros.startswith(data):
                                 # all-zero chunk: create a hole in a sparse file
                                 fd.seek(len(data), 1)
                             else:

From 92f221075aa51a470892eeadbe5f91bddd30a4af Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 18:53:35 +0100
Subject: [PATCH 07/17] refactor recreate to use chunk_to_id_data

---
 src/borg/archive.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index a3fdd371c..6c4aa2b74 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1983,11 +1983,7 @@ class ArchiveRecreater:
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
     def chunk_processor(self, target, chunk):
-        # as this is recreate (we do not read from the fs), we never have CH_HOLE here,
-        # but we need to add support for CH_ALLOC - TODO!
-        assert chunk.meta['allocation'] == CH_DATA
-        data = chunk.data
-        chunk_id = self.key.id_hash(data)
+        chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
         overwrite = self.recompress

From f3088a989356476cfc0bf04948a80f714d05a2b0 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 19:16:47 +0100
Subject: [PATCH 08/17] rename chunk_to_id_data to cached_hash

---
 src/borg/archive.py  | 6 +++---
 src/borg/chunker.pyx | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 6c4aa2b74..9ac37e7f1 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data, zeros
+from .chunker import get_chunker, max_chunk_size, Chunk, cached_hash, zeros
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -1133,7 +1133,7 @@ class ChunksProcessor:
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
             def chunk_processor(chunk):
-                chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
                 chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
@@ -1983,7 +1983,7 @@ class ArchiveRecreater:
         target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
 
     def chunk_processor(self, target, chunk):
-        chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
+        chunk_id, data = cached_hash(chunk, self.key.id_hash)
         if chunk_id in self.seen_chunks:
             return self.cache.chunk_incref(chunk_id, target.stats)
         overwrite = self.recompress
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 1fd316a03..1da811500 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -61,7 +61,7 @@ zeros = bytes(MAX_DATA_SIZE)
 # have different hash_funcs within the same borg run.
 zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
 
-def chunk_to_id_data(chunk, id_hash):
+def cached_hash(chunk, id_hash):
     allocation = chunk.meta['allocation']
     if allocation == CH_DATA:
         data = chunk.data

From ef19d937ed09885b1c545cf179478053f7d3b93d Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 19:29:29 +0100
Subject: [PATCH 09/17] use cached_hash also to generate all-zero replacement
 chunks

At least when many fixed-size replacement chunks are needed, this is
much faster, because the id of an all-zero chunk of a given size is cached
instead of being hashed again. It also means less memory management overhead.
---
 src/borg/archive.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 9ac37e7f1..b023a9cac 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1662,8 +1662,8 @@ class ArchiveChecker:
             If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
             """
             def replacement_chunk(size):
-                data = bytes(size)
-                chunk_id = self.key.id_hash(data)
+                chunk = Chunk(None, allocation=CH_ALLOC, size=size)
+                chunk_id, data = cached_hash(chunk, self.key.id_hash)
                 cdata = self.key.encrypt(data)
                 csize = len(cdata)
                 return chunk_id, size, csize, cdata

From 4e3be1db5e73cb5a90f0a2865ccd961da7bc0661 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 8 Jan 2021 20:03:34 +0100
Subject: [PATCH 10/17] reuse zeros also in fixed-size chunker for all-zero
 chunk detection

also: zeros.startswith() is faster
---
 src/borg/chunker.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 1da811500..8c3a297ab 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -178,7 +178,7 @@ class ChunkerFixed:
         # should borg try to do sparse input processing?
         # whether it actually can be done depends on the input file being seekable.
         self.try_sparse = sparse and has_seek_hole
-        self.zeros = memoryview(bytes(block_size))
+        assert block_size <= len(zeros)
 
     def chunkify(self, fd=None, fh=-1, fmap=None):
         """
@@ -233,7 +233,7 @@ class ChunkerFixed:
                     # read block from the range
                     data = dread(offset, wanted, fd, fh)
                     got = len(data)
-                    if data == self.zeros[:got]:
+                    if zeros.startswith(data):
                         data = None
                         is_zero = True
                     else:

From 3b9798cffcb5b608bc103841d1d7bc22c41ea5ad Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 14 Jan 2021 19:56:39 +0100
Subject: [PATCH 11/17] remove max_chunk_size (unused)

---
 src/borg/archive.py  | 2 +-
 src/borg/chunker.pyx | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index b023a9cac..00a0867d0 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size, Chunk, cached_hash, zeros
+from .chunker import get_chunker, Chunk, cached_hash, zeros
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 8c3a297ab..7f763ff24 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -308,15 +308,6 @@ def get_chunker(algo, *params, **kw):
     raise TypeError('unsupported chunker algo %r' % algo)
 
 
-def max_chunk_size(algo, *params):
-    # see also parseformat.ChunkerParams return values
-    if algo == 'buzhash':
-        return 1 << params[1]
-    if algo == 'fixed':
-        return max(params[0], params[1])
-    raise TypeError('unsupported chunker algo %r' % algo)
-
-
 def buzhash(data, unsigned long seed):
     cdef uint32_t *table
     cdef uint32_t sum

From be257728cab4020b21d84ae06564e4af21366cf1 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 14 Jan 2021 20:02:18 +0100
Subject: [PATCH 12/17] move zeros to constants module

---
 src/borg/archive.py   | 2 +-
 src/borg/chunker.pyx  | 4 +---
 src/borg/constants.py | 4 ++++
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 00a0867d0..90013bd29 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, Chunk, cached_hash, zeros
+from .chunker import get_chunker, Chunk, cached_hash
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 7f763ff24..099532308 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -6,7 +6,7 @@ import errno
 import os
 from collections import namedtuple
 
-from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros
 from .lrucache import LRUCache
 
 from libc.stdlib cimport free
@@ -53,8 +53,6 @@ def Chunk(data, **meta):
     return _Chunk(meta, data)
 
 
-zeros = bytes(MAX_DATA_SIZE)
-
 # remember a few recently used all-zero chunk hashes in this mapping.
 # (hash_func, chunk_length) -> chunk_hash
 # we play safe and have the hash_func in the mapping key, in case we
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 46c2b564c..1bd9bb6dd 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -45,6 +45,10 @@ assert MAX_OBJECT_SIZE == 20 * 1024 * 1024
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2 ** 32 - MAX_OBJECT_SIZE
 
+# have one all-zero bytes object
+# we use it at all places where we need to detect or create all-zero buffers
+zeros = bytes(MAX_DATA_SIZE)
+
 # borg.remote read() buffer size
 BUFSIZE = 10 * 1024 * 1024
 

From e41dc6e96fe9ca5960d449d1479713cb537682c4 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 14 Jan 2021 20:19:10 +0100
Subject: [PATCH 13/17] use zeros for benchmarks

---
 src/borg/archiver.py            | 3 ++-
 src/borg/testsuite/benchmark.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 2fe47056d..37f62fb52 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -453,9 +453,10 @@ class Archiver:
         def test_files(path, count, size, random):
             path = os.path.join(path, 'borg-test-data')
             os.makedirs(path)
+            z_buff = None if random else memoryview(zeros)[:size] if size <= len(zeros) else b'\0' * size
             for i in range(count):
                 fname = os.path.join(path, 'file_%d' % i)
-                data = b'\0' * size if not random else os.urandom(size)
+                data = z_buff if not random else os.urandom(size)
                 with SyncFile(fname, binary=True) as fd:  # used for posix_fadvise's sake
                     fd.write(data)
             yield path
diff --git a/src/borg/testsuite/benchmark.py b/src/borg/testsuite/benchmark.py
index 1e70a101f..f3ec06f2a 100644
--- a/src/borg/testsuite/benchmark.py
+++ b/src/borg/testsuite/benchmark.py
@@ -11,6 +11,7 @@ import os
 import pytest
 
 from .archiver import changedir, cmd
+from ..constants import zeros
 
 
 @pytest.fixture
@@ -34,12 +35,13 @@ def repo(request, cmd, repo_url):
 @pytest.fixture(scope='session', params=["zeros", "random"])
 def testdata(request, tmpdir_factory):
     count, size = 10, 1000*1000
+    assert size <= len(zeros)
     p = tmpdir_factory.mktemp('data')
     data_type = request.param
     if data_type == 'zeros':
         # do not use a binary zero (\0) to avoid sparse detection
         def data(size):
-            return b'0' * size
+            return memoryview(zeros)[:size]
     elif data_type == 'random':
         def data(size):
             return os.urandom(size)

From 8162e2e67b7b817d6b01a60df52bb67256fd2ba3 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 14 Jan 2021 20:41:57 +0100
Subject: [PATCH 14/17] cached_hash is only used in archive, move it there

---
 src/borg/archive.py  | 29 ++++++++++++++++++++++++++++-
 src/borg/chunker.pyx | 26 --------------------------
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 90013bd29..b597a1aa6 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, Chunk, cached_hash
+from .chunker import get_chunker, Chunk
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -41,6 +41,7 @@ from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
 from .helpers import os_open, flags_normal, flags_dir
 from .helpers import msgpack
 from .helpers import sig_int
+from .lrucache import LRUCache
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
@@ -1088,6 +1089,32 @@ class MetadataCollector:
         return attrs
 
 
+# remember a few recently used all-zero chunk hashes in this mapping.
+# (hash_func, chunk_length) -> chunk_hash
+# we play safe and have the hash_func in the mapping key, in case we
+# have different hash_funcs within the same borg run.
+zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
+
+
+def cached_hash(chunk, id_hash):
+    allocation = chunk.meta['allocation']
+    if allocation == CH_DATA:
+        data = chunk.data
+        chunk_id = id_hash(data)
+    elif allocation in (CH_HOLE, CH_ALLOC):
+        size = chunk.meta['size']
+        assert size <= len(zeros)
+        data = memoryview(zeros)[:size]
+        try:
+            chunk_id = zero_chunk_ids[(id_hash, size)]
+        except KeyError:
+            chunk_id = id_hash(data)
+            zero_chunk_ids[(id_hash, size)] = chunk_id
+    else:
+        raise ValueError('unexpected allocation type')
+    return chunk_id, data
+
+
 class ChunksProcessor:
     # Processes an iterator of chunks for an Item
 
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 099532308..210ea461c 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -7,7 +7,6 @@ import os
 from collections import namedtuple
 
 from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros
-from .lrucache import LRUCache
 
 from libc.stdlib cimport free
 
@@ -53,31 +52,6 @@ def Chunk(data, **meta):
     return _Chunk(meta, data)
 
 
-# remember a few recently used all-zero chunk hashes in this mapping.
-# (hash_func, chunk_length) -> chunk_hash
-# we play safe and have the hash_func in the mapping key, in case we
-# have different hash_funcs within the same borg run.
-zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
-
-def cached_hash(chunk, id_hash):
-    allocation = chunk.meta['allocation']
-    if allocation == CH_DATA:
-        data = chunk.data
-        chunk_id = id_hash(data)
-    elif allocation in (CH_HOLE, CH_ALLOC):
-        size = chunk.meta['size']
-        assert size <= len(zeros)
-        data = memoryview(zeros)[:size]
-        try:
-            chunk_id = zero_chunk_ids[(id_hash, size)]
-        except KeyError:
-            chunk_id = id_hash(data)
-            zero_chunk_ids[(id_hash, size)] = chunk_id
-    else:
-        raise ValueError('unexpected allocation type')
-    return chunk_id, data
-
-
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh:

From 2d7636521496d4e76b9d23ea635e9ec7faa37713 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 15 Jan 2021 21:10:07 +0100
Subject: [PATCH 15/17] cosmetic: directly set allocation instead of going via
 is_zero

---
 src/borg/chunker.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 210ea461c..3af84807b 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -207,19 +207,19 @@ class ChunkerFixed:
                     got = len(data)
                     if zeros.startswith(data):
                         data = None
-                        is_zero = True
+                        allocation = CH_ALLOC
                     else:
-                        is_zero = False
+                        allocation = CH_DATA
                 else:  # hole
                     # seek over block from the range
                     pos = dseek(wanted, os.SEEK_CUR, fd, fh)
                     got = pos - offset
                     data = None
-                    is_zero = True
+                    allocation = CH_HOLE
                 if got > 0:
                     offset += got
                     range_size -= got
-                    yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE)
+                    yield Chunk(data, size=got, allocation=allocation)
                 if got < wanted:
                     # we did not get enough data, looks like EOF.
                     return

From 2391d160a80fd53deb7f9eb137b24d7fa78a067b Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 15 Jan 2021 21:27:29 +0100
Subject: [PATCH 16/17] add all-zero detection to buzhash chunk data processing

---
 src/borg/chunker.pyx | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 3af84807b..ee9773be4 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -242,6 +242,7 @@ cdef class Chunker:
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
         min_size = 1 << chunk_min_exp
         max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
         # see chunker_process, first while loop condition, first term must be able to get True:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
         hash_mask = (1 << hash_mask_bits) - 1
@@ -267,7 +268,16 @@ cdef class Chunker:
 
     def __next__(self):
         data = chunker_process(self.chunker)
-        return Chunk(data, size=len(data), allocation=CH_DATA)  # no sparse support here
+        got = len(data)
+        # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code,
+        # but we can just check if data was all-zero (and either came from a hole
+        # or from stored zeros - we can not detect that here).
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        return Chunk(data, size=got, allocation=allocation)
 
 
 def get_chunker(algo, *params, **kw):

From 6dc334422e8e34c6f7e45728ab8956c57e3bcd30 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Fri, 15 Jan 2021 21:51:15 +0100
Subject: [PATCH 17/17] fixup: improve comment about assumptions in the item
 metadata stream chunker

---
 src/borg/archive.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index b597a1aa6..c0e2fe0f5 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -337,8 +337,9 @@ class ChunkBuffer:
         self.buffer.seek(0)
         # The chunker returns a memoryview to its internal buffer,
         # thus a copy is needed before resuming the chunker iterator.
-        # note: this is the items metadata stream chunker, we only will get CH_DATA allocation here,
-        #       thus chunk.data will always be data bytes.
+        # note: this is the items metadata stream chunker, we will only get CH_DATA allocation here (because there are
+        #       no all-zero chunks in a metadata stream), thus chunk.data will always be bytes/memoryview and allocation
+        #       is always CH_DATA and never CH_ALLOC/CH_HOLE.
         chunks = list(bytes(chunk.data) for chunk in self.chunker.chunkify(self.buffer))
         self.buffer.seek(0)
         self.buffer.truncate(0)