From 01f72d15b4c4ebd5dd21e6787dacf0b44d433d5c Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 3 May 2022 20:51:43 +0200
Subject: [PATCH] transfer: remove the zlib type bytes hack

For a description of the hack, see the docstring of the ZLIB_legacy class.

Add a new, clean ZLIB class that works like every other compressor.

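IDs: ZLIB uses the ID bytes 0x0500; ZLIB_legacy has no separate ID bytes and
is detected by the 0x.8.. pattern that zlib-compressed data starts with.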
---
 src/borg/archiver.py           |  8 +++++-
 src/borg/compress.pyx          | 51 ++++++++++++++++++++++++++++++----
 src/borg/testsuite/archiver.py |  4 +--
 src/borg/testsuite/compress.py |  4 +--
 4 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 0c5dee5f1..098208167 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -44,7 +44,7 @@ try:
     from .archive import has_link
     from .cache import Cache, assert_secure, SecurityManager
     from .constants import *  # NOQA
-    from .compress import CompressionSpec
+    from .compress import CompressionSpec, ZLIB, ZLIB_legacy
     from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required
     from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey
     from .crypto.keymanager import KeyManager
@@ -351,6 +351,11 @@ class Archiver:
             item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
             return item
 
+        def upgrade_compressed_chunk(chunk):
+            if ZLIB_legacy.detect(chunk):
+                chunk = ZLIB.ID + chunk  # get rid of the attic legacy: prepend separate type bytes for zlib
+            return chunk
+
         dry_run = args.dry_run
 
         args.consider_checkpoints = True
@@ -378,6 +383,7 @@ class Archiver:
                                     cdata = other_repository.get(chunk_id)
                                     # keep compressed payload same, avoid decompression / recompression
                                     data = other_key.decrypt(chunk_id, cdata, decompress=False)
+                                    data = upgrade_compressed_chunk(data)
                                     chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False,
                                                                   compress=False, size=size)
                                     cache.repository.async_response(wait=False)
diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx
index 2e0eb4809..7997456c6 100644
--- a/src/borg/compress.pyx
+++ b/src/borg/compress.pyx
@@ -331,14 +331,52 @@ class ZSTD(DecidingCompressor):
         return dest[:osize]
 
 
-class ZLIB(CompressorBase):
+class ZLIB(DecidingCompressor):
     """
     zlib compression / decompression (python stdlib)
     """
-    ID = b'\x08\x00'  # not used here, see detect()
-                      # avoid all 0x.8.. IDs elsewhere!
+    ID = b'\x05\x00'
     name = 'zlib'
 
+    def __init__(self, level=6, **kwargs):
+        super().__init__(**kwargs)
+        self.level = level
+
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, zlib_data).
+
+        *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None.
+        """
+        zlib_data = zlib.compress(data, self.level)
+        if len(zlib_data) < len(data):
+            return self, zlib_data
+        else:
+            return NONE_COMPRESSOR, None
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        try:
+            return zlib.decompress(data)
+        except zlib.error as e:
+            raise DecompressionError(str(e)) from None
+
+
+class ZLIB_legacy(CompressorBase):
+    """
+    zlib compression / decompression (python stdlib)
+
+    Note: This is the legacy ZLIB support as used by borg < 1.3.
+          It still suffers from attic *only* supporting zlib and not having separate
+          ID bytes to differentiate between differently compressed chunks.
+          This just works because zlib compressed stuff always starts with 0x.8.. bytes.
+          Newer borg uses the ZLIB class that has separate ID bytes (as all the other
+          compressors) and does not need this hack.
+    """
+    ID = b'\x08\x00'  # not used here, see detect()
+    # avoid all 0x.8.. IDs elsewhere!
+    name = 'zlib_legacy'
+
     @classmethod
     def detect(cls, data):
         # matches misc. patterns 0x.8.. used by zlib
@@ -502,13 +540,14 @@ COMPRESSOR_TABLE = {
     CNONE.name: CNONE,
     LZ4.name: LZ4,
     ZLIB.name: ZLIB,
+    ZLIB_legacy.name: ZLIB_legacy,
     LZMA.name: LZMA,
     Auto.name: Auto,
     ZSTD.name: ZSTD,
     ObfuscateSize.name: ObfuscateSize,
 }
 # List of possible compression types. Does not include Auto, since it is a meta-Compressor.
-COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ]  # check fast stuff first
+COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, ZLIB_legacy, LZMA, ObfuscateSize, ]  # check fast stuff first
 
 def get_compressor(name, **kwargs):
     cls = COMPRESSOR_TABLE[name]
@@ -554,7 +593,7 @@ class CompressionSpec:
         self.name = values[0]
         if self.name in ('none', 'lz4', ):
             return
-        elif self.name in ('zlib', 'lzma', ):
+        elif self.name in ('zlib', 'lzma', 'zlib_legacy'):  # zlib_legacy just for testing
             if count < 2:
                 level = 6  # default compression level in py stdlib
             elif count == 2:
@@ -597,7 +636,7 @@ class CompressionSpec:
     def compressor(self):
         if self.name in ('none', 'lz4', ):
             return get_compressor(self.name)
-        elif self.name in ('zlib', 'lzma', 'zstd', ):
+        elif self.name in ('zlib', 'lzma', 'zstd', 'zlib_legacy'):
             return get_compressor(self.name, level=self.level)
         elif self.name == 'auto':
             return get_compressor(self.name, compressor=self.inner.compressor)
diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py
index 5889b12ab..b69fe819f 100644
--- a/src/borg/testsuite/archiver.py
+++ b/src/borg/testsuite/archiver.py
@@ -2442,7 +2442,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def test_compression_zlib_compressible(self):
         size, csize = self._get_sizes('zlib', compressible=True)
         assert csize < size * 0.1
-        assert csize == 35
+        assert csize == 37
 
     def test_compression_zlib_uncompressible(self):
         size, csize = self._get_sizes('zlib', compressible=False)
@@ -2451,7 +2451,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
     def test_compression_auto_compressible(self):
         size, csize = self._get_sizes('auto,zlib', compressible=True)
         assert csize < size * 0.1
-        assert csize == 35  # same as compression 'zlib'
+        assert csize == 37  # same as compression 'zlib'
 
     def test_compression_auto_uncompressible(self):
         size, csize = self._get_sizes('auto,zlib', compressible=False)
diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py
index 3942c3537..c93dd3bb6 100644
--- a/src/borg/testsuite/compress.py
+++ b/src/borg/testsuite/compress.py
@@ -88,11 +88,11 @@ def test_autodetect_invalid():
         Compressor(**params).decompress(b'\x08\x00notreallyzlib')
 
 
-def test_zlib_compat():
+def test_zlib_legacy_compat():
     # for compatibility reasons, we do not add an extra header for zlib,
     # nor do we expect one when decompressing / autodetecting
     for level in range(10):
-        c = get_compressor(name='zlib', level=level)
+        c = get_compressor(name='zlib_legacy', level=level)
         cdata1 = c.compress(data)
         cdata2 = zlib.compress(data, level)
         assert cdata1 == cdata2