diff --git a/borg/archiver.py b/borg/archiver.py
index 84e568e73..032313dbf 100644
--- a/borg/archiver.py
+++ b/borg/archiver.py
@@ -14,6 +14,7 @@ import traceback
 
 from . import __version__
 from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
+from .compress import Compressor, COMPR_BUFFER
 from .repository import Repository
 from .cache import Cache
 from .key import key_creator
@@ -21,7 +22,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
-    is_cachedir, bigint_to_int, ChunkerParams
+    is_cachedir, bigint_to_int, ChunkerParams, CompressionSpec
 from .remote import RepositoryServer, RemoteRepository
 
 
@@ -101,7 +102,9 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         t0 = datetime.now()
         repository = self.open_repository(args.archive, exclusive=True)
         manifest, key = Manifest.load(repository)
-        key.compression_level = args.compression
+        compr_args = dict(buffer=COMPR_BUFFER)
+        compr_args.update(args.compression)
+        key.compressor = Compressor(**compr_args)
         cache = Cache(repository, key, manifest, do_files=args.cache_files)
         archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
                           create=True, checkpoint_interval=args.checkpoint_interval,
@@ -634,9 +637,16 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
                                help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('-C', '--compression', dest='compression',
-                               type=int, default=0, metavar='N',
-                               help='select compression algorithm and level. 0..9 is supported and means zlib '
-                                    'level 0 (no compression, fast, default) .. zlib level 9 (high compression, slow).')
+                               type=CompressionSpec, default=dict(name='null'), metavar='COMPRESSION',
+                               help='select compression algorithm and level, by giving a number: '
+                                    '0 == no compression [default], '
+                                    '1..9 == zlib level 1..9, '
+                                    '10 == lz4. '
+                                    'Alternatively, you can also give a name and optionally additional args: '
+                                    'null == no compression, '
+                                    'zlib == zlib (default level 6), '
+                                    'zlib,0 .. zlib,9 == zlib (with level 0..9), '
+                                    'lz4 == lz4.')
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')
diff --git a/borg/compress.pyx b/borg/compress.pyx
index 1ff00305f..03815b3a5 100644
--- a/borg/compress.pyx
+++ b/borg/compress.pyx
@@ -1,8 +1,5 @@
 import zlib
 
-from libc.stdlib cimport malloc, free
-
-
 cdef extern from "lz4.h":
     int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
     int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
@@ -40,7 +37,15 @@ class CNULL(CompressorBase):
     """
     ID = b'\x00\x00'
     name = 'null'
-    # base class does all we need
+
+    def compress(self, data):
+        return super().compress(data)
+
+    def decompress(self, data):
+        data = super().decompress(data)
+        if not isinstance(data, bytes):
+            data = bytes(data)
+        return data
 
 
 cdef class LZ4(CompressorBase):
@@ -71,6 +76,8 @@ cdef class LZ4(CompressorBase):
         self.bufsize = len(buffer)
 
     def compress(self, idata):
+        if not isinstance(idata, bytes):
+            idata = bytes(idata)  # code below does not work with memoryview
         cdef int isize = len(idata)
         cdef int osize = self.bufsize
         cdef char *source = idata
@@ -82,6 +89,8 @@ cdef class LZ4(CompressorBase):
         return super().compress(dest[:osize])
 
     def decompress(self, idata):
+        if not isinstance(idata, bytes):
+            idata = bytes(idata)  # code below does not work with memoryview
         idata = super().decompress(idata)
         cdef int isize = len(idata)
         cdef int osize = self.bufsize
@@ -141,7 +150,7 @@ class Compressor:
     compresses using a compressor with given name and parameters
     decompresses everything we can handle (autodetect)
     """
-    def __init__(self, name='zlib', **kwargs):
+    def __init__(self, name='null', **kwargs):
         self.params = kwargs
         self.compressor = get_compressor(name, **self.params)
 
@@ -149,8 +158,14 @@ class Compressor:
         return self.compressor.compress(data)
 
     def decompress(self, data):
+        hdr = bytes(data[:2])  # detect() does not work with memoryview
         for cls in COMPRESSOR_LIST:
-            if cls.detect(data):
+            if cls.detect(hdr):
                 return cls(**self.params).decompress(data)
         else:
-            raise ValueError('No decompressor for this data found: %r.', data[:2])
+            raise ValueError('No decompressor for this data found: %r.' % hdr)
+
+
+# a buffer used for (de)compression result, which can be slightly bigger
+# than the chunk buffer in the worst (incompressible data) case, add 10%:
+COMPR_BUFFER = bytes(int(1.1 * 2 ** 23))  # CHUNK_MAX_EXP == 23
diff --git a/borg/helpers.py b/borg/helpers.py
index d20532723..69a6db0db 100644
--- a/borg/helpers.py
+++ b/borg/helpers.py
@@ -278,9 +278,55 @@ def timestamp(s):
 
 def ChunkerParams(s):
-    window_size, chunk_mask, chunk_min, chunk_max = s.split(',')
+    """Parse 'CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE' into a tuple of ints."""
+    chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
+    if int(chunk_max) > 23:
+        # do not go beyond 2**23 (8MB) chunk size now,
+        # COMPR_BUFFER can only cope with up to this size
+        raise ValueError('chunk_max must not exceed 23 (2**23 == 8MB chunks)')
-    return int(window_size), int(chunk_mask), int(chunk_min), int(chunk_max)
+    return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
 
 
+def CompressionSpec(s):
+    """Parse a --compression argument into keyword arguments for Compressor.
+
+    Accepts either a single number (0 == null, 1..9 == zlib level 1..9,
+    10 == lz4) or a name with optional args (null, lz4, zlib, zlib,0..9).
+    Raises ValueError for anything else, so it can be used as argparse type.
+    """
+    values = s.split(',')
+    count = len(values)
+    name = values[0]
+    try:
+        number = int(name)
+    except ValueError:
+        number = None
+    if number is not None:
+        # it is just --compression N, additional args are not allowed
+        if count > 1:
+            raise ValueError
+        if number == 0:
+            return dict(name='null')
+        if 1 <= number <= 9:
+            return dict(name='zlib', level=number)
+        if number == 10:
+            return dict(name='lz4')
+        raise ValueError
+    # --compression algo[,...]
+    if name in ('null', 'lz4', ):
+        return dict(name=name)
+    if name == 'zlib':
+        if count < 2:
+            level = 6  # default compression level in py stdlib
+        elif count == 2:
+            level = int(values[1])
+            if not 0 <= level <= 9:
+                raise ValueError
+        else:
+            raise ValueError
+        return dict(name='zlib', level=level)
+    raise ValueError
+
+
 def is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to
diff --git a/borg/key.py b/borg/key.py
index fabdae5b3..fcf083586 100644
--- a/borg/key.py
+++ b/borg/key.py
@@ -6,9 +6,9 @@ import msgpack
 import textwrap
 import hmac
 from hashlib import sha256
-import zlib
 
 from .crypto import pbkdf2_sha256, get_random_bytes, AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks
+from .compress import Compressor, COMPR_BUFFER
 from .helpers import IntegrityError, get_keys_dir, Error
 
 PREFIX = b'\0' * 8
@@ -68,7 +68,7 @@ class KeyBase:
         self.TYPE_STR = bytes([self.TYPE])
         self.repository = repository
         self.target = None  # key location file path / repo obj
-        self.compression_level = 0
+        self.compressor = Compressor('null', buffer=COMPR_BUFFER)
 
     def id_hash(self, data):
         """Return HMAC hash using the "id" HMAC key
@@ -99,12 +99,12 @@ class PlaintextKey(KeyBase):
         return sha256(data).digest()
 
     def encrypt(self, data):
-        return b''.join([self.TYPE_STR, zlib.compress(data, self.compression_level)])
+        return b''.join([self.TYPE_STR, self.compressor.compress(data)])
 
     def decrypt(self, id, data):
         if data[0] != self.TYPE:
             raise IntegrityError('Invalid encryption envelope')
-        data = zlib.decompress(memoryview(data)[1:])
+        data = self.compressor.decompress(memoryview(data)[1:])
         if id and sha256(data).digest() != id:
             raise IntegrityError('Chunk id verification failed')
         return data
@@ -131,7 +131,7 @@ class AESKeyBase(KeyBase):
         return HMAC(self.id_key, data, sha256).digest()
 
     def encrypt(self, data):
-        data = zlib.compress(data, self.compression_level)
+        data = self.compressor.compress(data)
         self.enc_cipher.reset()
         data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data)))
         hmac = HMAC(self.enc_hmac_key, data, sha256).digest()
@@ -144,7 +144,7 @@ class AESKeyBase(KeyBase):
         if memoryview(HMAC(self.enc_hmac_key, memoryview(data)[33:], sha256).digest()) != hmac:
             raise IntegrityError('Encryption envelope checksum mismatch')
         self.dec_cipher.reset(iv=PREFIX + data[33:41])
-        data = zlib.decompress(self.dec_cipher.decrypt(data[41:]))  # should use memoryview
+        data = self.compressor.decompress(self.dec_cipher.decrypt(data[41:]))
         if id and HMAC(self.id_key, data, sha256).digest() != id:
             raise IntegrityError('Chunk id verification failed')
         return data