diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index c0b0aafdf..797a3ff1a 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -72,7 +72,8 @@ cdef class CompressorBase: using Auto compression it needs to determine the _actual_ target compression of a chunk in order to detect whether it should be recompressed. - For all Compressors that are not Auto this always returns *self*. + Any compressor may return a compressor other than *self*, like e.g. the CNONE compressor, + and should actually do so if *data* would become larger on compression. """ return self @@ -95,6 +96,50 @@ cdef class CompressorBase: # strip ID bytes return data[2:] +cdef class DecidingCompressor(CompressorBase): + """ + base class for (de)compression classes that (based on an internal _decide + method) decide whether and how to compress data. + """ + name = 'decidebaseclass' + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, compressed_data). + + *compressed_data* can be the result of *data* being processed by *compressor*, + if that is generated as a side-effect of the decision process, or None otherwise. + + This private method allows for more efficient implementation of compress() + and decide_compress() making use of *compressed_data*, if already generated. + """ + raise NotImplementedError + + def decide(self, data): + return self._decide(data)[0] + + def decide_compress(self, data): + """ + Decides what to do with *data* and handle accordingly. Returns (compressor, compressed_data). + + *compressed_data* is the result of *data* being processed by *compressor*. + """ + compressor, compressed_data = self._decide(data) + + if compressed_data is None: + compressed_data = compressor.compress(data) + + if compressor is self: + # call super class to add ID bytes + return self, super().compress(compressed_data) + + return compressor, compressed_data + + def compress(self, data): + return self.decide_compress(data)[1] class CNONE(CompressorBase): """ @@ -113,7 +158,7 @@ class CNONE(CompressorBase): return data -class LZ4(CompressorBase): +class LZ4(DecidingCompressor): """ raw LZ4 compression / decompression (liblz4). @@ -128,7 +173,12 @@ class LZ4(CompressorBase): def __init__(self, **kwargs): pass - def compress(self, idata): + def _decide(self, idata): + """ + Decides what to do with *data*. Returns (compressor, lz4_data). + + *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None. + """ if not isinstance(idata, bytes): idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) @@ -142,7 +192,11 @@ class LZ4(CompressorBase): osize = LZ4_compress_default(source, dest, isize, osize) if not osize: raise Exception('lz4 compress failed') - return super().compress(dest[:osize]) + # only compress if the result actually is smaller + if osize < isize: + return self, dest[:osize] + else: + return NONE_COMPRESSOR, None def decompress(self, idata): if not isinstance(idata, bytes): @@ -174,7 +228,7 @@ class LZ4(CompressorBase): return dest[:rsize] -class LZMA(CompressorBase): +class LZMA(DecidingCompressor): """ lzma compression / decompression """ @@ -187,10 +241,18 @@ class LZMA(CompressorBase): if lzma is None: raise ValueError('No lzma support found.') - def compress(self, data): + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, lzma_data). + + *lzma_data* is the LZMA result if *compressor* is LZMA as well, otherwise it is None. + """ # we do not need integrity checks in lzma, we do that already - data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE) - return super().compress(data) + lzma_data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE) + if len(lzma_data) < len(data): + return self, lzma_data + else: + return NONE_COMPRESSOR, None def decompress(self, data): data = super().decompress(data) @@ -200,7 +262,7 @@ class LZMA(CompressorBase): raise DecompressionError(str(e)) from None -class ZSTD(CompressorBase): +class ZSTD(DecidingCompressor): """zstd compression / decompression (pypi: zstandard, gh: python-zstandard)""" # This is a NOT THREAD SAFE implementation. # Only ONE python context must to be created at a time. @@ -212,7 +274,12 @@ class ZSTD(CompressorBase): super().__init__(**kwargs) self.level = level - def compress(self, idata): + def _decide(self, idata): + """ + Decides what to do with *data*. Returns (compressor, zstd_data). + + *zstd_data* is the ZSTD result if *compressor* is ZSTD as well, otherwise it is None. + """ if not isinstance(idata, bytes): idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) @@ -227,7 +294,11 @@ class ZSTD(CompressorBase): osize = ZSTD_compress(dest, osize, source, isize, level) if ZSTD_isError(osize): raise Exception('zstd compress failed: %s' % ZSTD_getErrorName(osize)) - return super().compress(dest[:osize]) + # only compress if the result actually is smaller + if osize < isize: + return self, dest[:osize] + else: + return NONE_COMPRESSOR, None def decompress(self, idata): if not isinstance(idata, bytes): @@ -306,44 +377,54 @@ class Auto(CompressorBase): def _decide(self, data): """ - Decides what to do with *data*. Returns (compressor, lz4_data). + Decides what to do with *data*. Returns (compressor, compressed_data). - *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None. + *compressor* is the compressor that is decided to be best suited to compress + *data*, *compressed_data* is the result of *data* being compressed by a + compressor, which may or may not be *compressor*! + + There are three possible outcomes of the decision process: + * *data* compresses well enough for trying the more expensive compressor + set on instantiation to make sense. + In this case, (expensive_compressor_class, lz4_compressed_data) is + returned. + * *data* compresses only slightly using the LZ4 compressor, thus trying + the more expensive compressor for potentially little gain does not + make sense. + In this case, (LZ4_COMPRESSOR, lz4_compressed_data) is returned. + * *data* does not compress at all using LZ4, in this case + (NONE_COMPRESSOR, none_compressed_data) is returned. + + Note: While it makes no sense, the expensive compressor may well be set + to the LZ4 compressor. """ - lz4_data = LZ4_COMPRESSOR.compress(data) - # lz4_data includes the compression type header, while data does not yet - ratio = len(lz4_data) / (len(data) + 2) + compressor, compressed_data = LZ4_COMPRESSOR.decide_compress(data) + # compressed_data includes the compression type header, while data does not yet + ratio = len(compressed_data) / (len(data) + 2) if ratio < 0.97: - return self.compressor, lz4_data - elif ratio < 1: - return LZ4_COMPRESSOR, lz4_data + return self.compressor, compressed_data else: - return NONE_COMPRESSOR, None + return compressor, compressed_data def decide(self, data): return self._decide(data)[0] def compress(self, data): - compressor, lz4_data = self._decide(data) - if compressor is LZ4_COMPRESSOR: + compressor, cheap_compressed_data = self._decide(data) + if compressor in (LZ4_COMPRESSOR, NONE_COMPRESSOR): # we know that trying to compress with expensive compressor is likely pointless, - # but lz4 managed to at least squeeze the data a bit. - return lz4_data - if compressor is NONE_COMPRESSOR: - # we know that trying to compress with expensive compressor is likely pointless - # and also lz4 did not manage to squeeze the data (not even a bit). - uncompressed_data = compressor.compress(data) - return uncompressed_data + # so we fallback to return the cheap compressed data. + return cheap_compressed_data # if we get here, the decider decided to try the expensive compressor. - # we also know that lz4_data is smaller than uncompressed data. - exp_compressed_data = compressor.compress(data) - ratio = len(exp_compressed_data) / len(lz4_data) + # we also know that the compressed data returned by the decider is lz4 compressed. + expensive_compressed_data = compressor.compress(data) + ratio = len(expensive_compressed_data) / len(cheap_compressed_data) if ratio < 0.99: # the expensive compressor managed to squeeze the data significantly better than lz4. - return exp_compressed_data + return expensive_compressed_data else: # otherwise let's just store the lz4 data, which decompresses extremely fast. - return lz4_data + return cheap_compressed_data def decompress(self, data): raise NotImplementedError diff --git a/src/borg/remote.py b/src/borg/remote.py index a7699422a..99ea53d44 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -16,7 +16,7 @@ import traceback from subprocess import Popen, PIPE from . import __version__ -from .compress import LZ4 +from .compress import Compressor from .constants import * # NOQA from .helpers import Error, IntegrityError from .helpers import bin_to_hex @@ -1215,7 +1215,7 @@ def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None key = decrypted_cache # 32 bit csize, 64 bit (8 byte) xxh64 cache_struct = struct.Struct('=I8s') - compressor = LZ4() + compressor = Compressor('lz4') def pack(data): csize, decrypted = data diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index e707e685b..08fd3278b 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2140,7 +2140,7 @@ class ArchiverTestCase(ArchiverTestCaseBase): def test_compression_lz4_uncompressible(self): size, csize = self._get_sizes('lz4', compressible=False) - assert csize >= size + assert csize == size + 3 # same as compression 'none' def test_compression_lzma_compressible(self): size, csize = self._get_sizes('lzma', compressible=True) @@ -2148,7 +2148,15 @@ class ArchiverTestCase(ArchiverTestCaseBase): def test_compression_lzma_uncompressible(self): size, csize = self._get_sizes('lzma', compressible=False) - assert csize >= size + assert csize == size + 3 # same as compression 'none' + + def test_compression_zstd_compressible(self): + size, csize = self._get_sizes('zstd', compressible=True) + assert csize < size * 0.1 + + def test_compression_zstd_uncompressible(self): + size, csize = self._get_sizes('zstd', compressible=False) + assert csize == size + 3 # same as compression 'none' def test_change_passphrase(self): self.cmd('init', '--encryption=repokey', self.repository_location) diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py index b7029e961..45b438b85 100644 --- a/src/borg/testsuite/compress.py +++ b/src/borg/testsuite/compress.py @@ -43,11 +43,13 @@ def test_lz4(): assert data == Compressor(**params).decompress(cdata) # autodetect -def test_lz4_buffer_allocation(): +def test_lz4_buffer_allocation(monkeypatch): + # disable fallback to no compression on incompressible data + monkeypatch.setattr(LZ4, 'decide', lambda always_compress: LZ4) # test with a rather huge data object to see if buffer allocation / resizing works data = os.urandom(5 * 2**20) * 10 # 50MiB badly compressible data assert len(data) == 50 * 2**20 - c = get_compressor(name='lz4') + c = Compressor('lz4') cdata = c.compress(data) assert len(cdata) > len(data) assert data == c.decompress(cdata) diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index b397435c2..1e09802ea 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -264,9 +264,8 @@ class TestKey: assert len(key.id_key) == 32 plaintext = b'123456789' authenticated = key.encrypt(plaintext) - # 0x07 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed - # block of length nine follows (the plaintext). - assert authenticated == b'\x07\x01\x00\x90' + plaintext + # 0x07 is the key TYPE, \x0000 identifies no compression. + assert authenticated == b'\x07\x00\x00' + plaintext def test_blake2_authenticated_encrypt(self, monkeypatch): monkeypatch.setenv('BORG_PASSPHRASE', 'test') @@ -275,9 +274,8 @@ class TestKey: assert len(key.id_key) == 128 plaintext = b'123456789' authenticated = key.encrypt(plaintext) - # 0x06 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed - # block of length nine follows (the plaintext). - assert authenticated == b'\x06\x01\x00\x90' + plaintext + # 0x06 is the key TYPE, 0x0000 identifies no compression. + assert authenticated == b'\x06\x00\x00' + plaintext class TestPassphrase: